Coverage for src/wiktextract/extractor/zh/etymology.py: 50%

1from wikitextprocessor import (

2 HTMLNode,

3 LevelNode,

4 NodeKind,

5 TemplateNode,

6 WikiNode,

9from ...page import clean_node

10from ...wxr_context import WiktextractContext

11from .models import Example, Form, WordEntry

12from .tags import translate_raw_tags

def extract_etymology_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
):
    """Fill ``base_data`` with the etymology found under ``level_node``.

    The direct children of ``level_node`` are visited in order and the walk
    stops at the first nested ``LevelNode``:

    * ``zh-x``/``zh-q`` templates (top level or anywhere inside a list) are
      turned into entries of ``base_data.etymology_examples``;
    * a fixed set of templates (matched case-insensitively) is skipped;
    * a list without any ``zh-x``/``zh-q`` template contributes one entry to
      ``base_data.etymology_texts`` per non-empty list item;
    * ``ja-see``/``ja-see-kango``/``zh-see`` templates append a deep copy of
      ``base_data`` to ``page_data`` and hand that copy to
      ``process_soft_redirect_template``;
    * ``*-kanjitab``/``ja-kt`` templates go to
      ``extract_ja_kanjitab_template``;
    * every other node is buffered and cleaned as one text at the end.
    """
    # Imported here rather than at module level, as in the original code
    # (presumably to avoid a circular import -- confirm before moving).
    from .example import extract_template_zh_x

    example_templates = ("zh-x", "zh-q")
    skipped_templates = (
        "rfe",  # missing etymology
        "zh-forms",
        "zh-wp",
        "wp",
        "wikipedia",
    )
    redirect_templates = ("ja-see", "ja-see-kango", "zh-see")

    def collect_examples(example_template):
        # Store the examples first, then clean the template node with
        # ``base_data`` as the target.
        base_data.etymology_examples.extend(
            extract_template_zh_x(wxr, example_template, Example(text=""))
        )
        clean_node(wxr, base_data, example_template)

    pending_nodes = []
    for child in level_node.children:
        is_template = isinstance(child, TemplateNode)

        if is_template and child.template_name in example_templates:
            collect_examples(child)
            continue

        if is_template and child.template_name.lower() in skipped_templates:
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            found_example = False
            for nested in child.find_child_recursively(NodeKind.TEMPLATE):
                if nested.template_name in example_templates:
                    found_example = True
                    collect_examples(nested)
            if not found_example:
                for item in child.find_child(NodeKind.LIST_ITEM):
                    item_text = clean_node(wxr, None, item.children)
                    if item_text:
                        base_data.etymology_texts.append(item_text)
            continue

        if is_template and child.template_name in redirect_templates:
            from .page import process_soft_redirect_template

            redirect_entry = base_data.model_copy(deep=True)
            page_data.append(redirect_entry)
            process_soft_redirect_template(wxr, child, redirect_entry)
            continue

        if is_template and (
            child.template_name.endswith("-kanjitab")
            or child.template_name == "ja-kt"
        ):
            extract_ja_kanjitab_template(wxr, child, base_data)
            continue

        if isinstance(child, LevelNode):
            # A nested section starts here; nothing after it is read.
            break

        pending_nodes.append(child)

    if pending_nodes:
        combined_text = clean_node(wxr, base_data, pending_nodes)
        if combined_text:
            base_data.etymology_texts.append(combined_text)

def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Add the alternative spellings of a kanjitab template to ``base_data``.

    The template is expanded and re-parsed. Only tables that contain a
    header cell whose cleaned text is "其他表記" are read. Inside such a
    table every ``<span>`` directly under a table cell becomes a ``Form``
    tagged ``alternative``/``kanji``, and every ``<small>`` directly under a
    table cell supplies a raw tag (surrounding parentheses stripped) for the
    most recently collected form. Finally each top-level link of the
    expanded tree is cleaned with ``base_data`` as the target.
    """
    # https://zh.wiktionary.org/wiki/Template:ja-kanjitab
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)

    for table in root.find_child(NodeKind.TABLE):
        # Every header cell is cleaned, exactly as a full scan would do.
        header_texts = [
            clean_node(wxr, None, header_cell)
            for header_row in table.find_child(NodeKind.TABLE_ROW)
            for header_cell in header_row.find_child(
                NodeKind.TABLE_HEADER_CELL
            )
        ]
        if "其他表記" not in header_texts:
            continue

        alt_forms = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(NodeKind.TABLE_CELL):
                for child in cell.children:
                    if not isinstance(child, HTMLNode):
                        continue
                    if child.tag == "span":
                        spelling = clean_node(wxr, None, child)
                        if spelling != "":
                            new_form = Form(
                                form=spelling, tags=["alternative", "kanji"]
                            )
                            alt_forms.append(new_form)
                    elif child.tag == "small":
                        label = clean_node(wxr, None, child).strip("()")
                        if label != "" and alt_forms:
                            # The label qualifies the form collected just
                            # before it.
                            latest_form = alt_forms[-1]
                            latest_form.raw_tags.append(label)
                            translate_raw_tags(latest_form)
        base_data.forms.extend(alt_forms)

    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)

Coverage for src / wiktextract / extractor / zh / etymology.py: 50%

69 statements