Coverage for src / wiktextract / extractor / zh / etymology.py: 50%

69 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1from wikitextprocessor import ( 

2 HTMLNode, 

3 LevelNode, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .models import Example, Form, WordEntry 

12from .tags import translate_raw_tags 

13 

14 

def extract_etymology_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
):
    """Extract etymology data from the children of an etymology section.

    Walks ``level_node.children`` in order and stops at the first nested
    ``LevelNode``. Depending on the child:

    * ``zh-x`` / ``zh-q`` templates (directly in the section or anywhere
      inside a list) are converted to examples and appended to
      ``base_data.etymology_examples``.
    * A list without any such template contributes one entry per non-empty
      list item to ``base_data.etymology_texts``.
    * ``ja-see`` / ``ja-see-kango`` / ``zh-see`` templates append a deep copy
      of ``base_data`` to ``page_data`` and hand that copy to
      ``process_soft_redirect_template``.
    * ``*-kanjitab`` / ``ja-kt`` templates are passed to
      ``extract_ja_kanjitab_template``.
    * A fixed set of templates (compared case-insensitively) is skipped.
    * Everything else is gathered and cleaned as a single text, which is
      appended to ``base_data.etymology_texts`` when not empty.
    """
    # Imported lazily inside the function, as in the original code
    # (presumably to avoid a circular import - confirm before hoisting).
    from .example import extract_template_zh_x

    example_templates = ("zh-x", "zh-q")
    skipped_templates = (
        "rfe",  # missing etymology
        "zh-forms",
        "zh-wp",
        "wp",
        "wikipedia",
    )
    soft_redirect_templates = ("ja-see", "ja-see-kango", "zh-see")

    def add_examples(example_template: TemplateNode) -> None:
        # Store every example produced by the template, then run clean_node
        # on the template with base_data as the target.
        base_data.etymology_examples.extend(
            extract_template_zh_x(wxr, example_template, Example(text=""))
        )
        clean_node(wxr, base_data, example_template)

    text_nodes = []
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in example_templates:
                add_examples(child)
                continue
            if name.lower() in skipped_templates:
                continue
            if name in soft_redirect_templates:
                from .page import process_soft_redirect_template

                redirect_entry = base_data.model_copy(deep=True)
                page_data.append(redirect_entry)
                process_soft_redirect_template(wxr, child, redirect_entry)
                continue
            if name.endswith("-kanjitab") or name == "ja-kt":
                extract_ja_kanjitab_template(wxr, child, base_data)
                continue
            # Any other template is treated as part of the etymology text.
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            found_example = False
            for nested in child.find_child_recursively(NodeKind.TEMPLATE):
                if nested.template_name not in example_templates:
                    continue
                found_example = True
                add_examples(nested)
            if not found_example:
                # Plain list: each non-empty item becomes its own text.
                for item in child.find_child(NodeKind.LIST_ITEM):
                    item_text = clean_node(wxr, None, item.children)
                    if item_text:
                        base_data.etymology_texts.append(item_text)
            continue
        elif isinstance(child, LevelNode):
            # A nested section ends this section's own content.
            break
        text_nodes.append(child)

    if text_nodes:
        section_text = clean_node(wxr, base_data, text_nodes)
        if section_text:
            base_data.etymology_texts.append(section_text)

80 

81 

def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Add alternative kanji forms from a kanjitab template to ``base_data``.

    The template is converted back to wikitext and re-parsed with
    ``expand_all=True``. Only tables of the expanded output that have a
    header cell whose cleaned text equals "其他表記" are processed: inside
    their data cells every non-empty ``<span>`` becomes a ``Form`` tagged
    ``["alternative", "kanji"]``, and every non-empty ``<small>`` (with
    surrounding parentheses stripped) is added as a raw tag to the most
    recently created form, followed by ``translate_raw_tags``. Finally every
    top-level link node of the expanded output is passed to ``clean_node``
    together with ``base_data``.
    """
    # https://zh.wiktionary.org/wiki/Template:ja-kanjitab
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    for table in expanded.find_child(NodeKind.TABLE):
        # Clean every header cell of the table (no early exit) and then look
        # for the alternative-forms header among the results.
        header_texts = [
            clean_node(wxr, None, header_cell)
            for header_row in table.find_child(NodeKind.TABLE_ROW)
            for header_cell in header_row.find_child(
                NodeKind.TABLE_HEADER_CELL
            )
        ]
        if "其他表記" not in header_texts:
            continue

        alt_forms = []
        for data_row in table.find_child(NodeKind.TABLE_ROW):
            for cell in data_row.find_child(NodeKind.TABLE_CELL):
                for element in cell.children:
                    if not isinstance(element, HTMLNode):
                        continue
                    if element.tag == "span":
                        word = clean_node(wxr, None, element)
                        if word:
                            alt_forms.append(
                                Form(form=word, tags=["alternative", "kanji"])
                            )
                    elif element.tag == "small":
                        raw_tag = clean_node(wxr, None, element).strip("()")
                        # A tag seen before any form has nothing to attach to.
                        if raw_tag and alt_forms:
                            latest_form = alt_forms[-1]
                            latest_form.raw_tags.append(raw_tag)
                            translate_raw_tags(latest_form)
        base_data.forms.extend(alt_forms)

    # NOTE(review): presumably this lets clean_node record link-derived data
    # (e.g. categories) on base_data - confirm against clean_node.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)