Coverage for src/wiktextract/extractor/ruby.py: 86%

77 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from typing import Optional, Union 

2 

3from wikitextprocessor import NodeKind, WikiNode 

4from wikitextprocessor.parser import ( 

5 GeneralNode, 

6 HTMLNode, 

7 LevelNode, 

8 TemplateNode, 

9) 

10 

11from wiktextract.page import clean_node 

12from wiktextract.wxr_context import WiktextractContext 

13 

14 

def parse_ruby(
    wxr: WiktextractContext, node: WikiNode
) -> Optional[tuple[str, str]]:
    """Split an HTML 'ruby' node into its base text and its furigana.

    Children that are HTML 'rt' elements supply the furigana; HTML 'rp'
    elements (the fallback parentheses) are dropped; every other child is
    treated as part of the base (kanji) text. Both parts are rendered with
    clean_node() and stripped.

    Returns a (base_text, furigana) tuple, or None when either part comes
    out empty.
    """
    # Both lists are typed as str-or-WikiNode so they can be handed to
    # clean_node() as-is; the furigana list only ever receives WikiNodes.
    base_parts: list[Union[str, WikiNode]] = []
    reading_parts: list[Union[str, WikiNode]] = []
    for item in node.children:
        is_annotation = (
            isinstance(item, WikiNode)
            and item.kind == NodeKind.HTML
            and item.sarg in {"rp", "rt"}
        )
        if not is_annotation:
            base_parts.append(item)
        elif item.sarg == "rt":
            reading_parts.append(item)
        # Remaining case is an 'rp' element: intentionally discarded.
    base_text = clean_node(wxr, None, base_parts).strip()
    reading_text = clean_node(wxr, None, reading_parts).strip()
    if base_text and reading_text:
        return base_text, reading_text
    # As seen on パイスラッシュ, a template can apparently produce a ruby
    # element with one side empty; such an element leaves no trace in the
    # final HTML of the page, so it is reported as "no ruby" here.
    return None

42 

43 

def extract_ruby(
    wxr: WiktextractContext,
    contents: GeneralNode,
) -> tuple[list[tuple[str, str]], list[Union[WikiNode, str]]]:
    """Pull ruby annotations out of `contents` and rebuild it without them.

    `contents` may be a list/tuple of items, a WikiNode, or any other value
    (which is passed through unchanged). Every HTML 'ruby' element that
    parse_ruby() can parse is replaced by its base text, and its
    (base_text, furigana) pair is collected. All other WikiNodes are
    re-created as new nodes whose args and/or children have been processed
    recursively, depending on the node kind.

    Returns a tuple (pairs, new_contents): the collected pairs, and a list
    holding the rebuilt content.

    Raises RuntimeError for a WikiNode kind that has no handling here.
    """
    found = []

    # Sequences: handle each element on its own and concatenate the results.
    if isinstance(contents, (list, tuple)):
        rebuilt = []
        for item in contents:
            item_found, item_rebuilt = extract_ruby(wxr, item)
            found += item_found
            rebuilt += item_rebuilt
        return found, rebuilt

    # Anything that is not a WikiNode is passed through untouched.
    if not isinstance(contents, WikiNode):
        return [], [contents]

    kind = contents.kind

    # A ruby element that parses cleanly is replaced by its base text.
    if kind == NodeKind.HTML and contents.sarg == "ruby":
        pair = parse_ruby(wxr, contents)
        if pair is not None:
            return [pair], [pair[0]]
        # An unparseable ruby element falls through and is treated like
        # any other HTML node below.

    def rebuild_args():
        # Recurse into every argument, collecting pairs along the way.
        processed = []
        for arg in contents.largs:
            arg_found, arg_rebuilt = extract_ruby(wxr, arg)
            processed.append(arg_rebuilt)
            found.extend(arg_found)
        return processed

    def rebuild_children():
        # Recurse into the child list, collecting pairs along the way.
        child_found, child_rebuilt = extract_ruby(wxr, contents.children)
        found.extend(child_found)
        return child_rebuilt

    heading_kinds = {
        NodeKind.LEVEL2,
        NodeKind.LEVEL3,
        NodeKind.LEVEL4,
        NodeKind.LEVEL5,
        NodeKind.LEVEL6,
    }
    children_only_kinds = {
        NodeKind.ITALIC,
        NodeKind.BOLD,
        NodeKind.TABLE,
        NodeKind.TABLE_CAPTION,
        NodeKind.TABLE_ROW,
        NodeKind.TABLE_HEADER_CELL,
        NodeKind.TABLE_CELL,
        NodeKind.PRE,
        NodeKind.PREFORMATTED,
    }
    args_only_kinds = {
        NodeKind.TEMPLATE,
        NodeKind.TEMPLATE_ARG,
        NodeKind.PARSER_FN,
        NodeKind.URL,
    }

    clone = WikiNode(kind, contents.loc)
    if kind in heading_kinds or kind == NodeKind.LINK:
        # Headings and links: process args first, then children.
        if kind in heading_kinds:
            clone = LevelNode(kind, clone.loc)
        clone.largs = rebuild_args()
        clone.children = rebuild_children()
    elif kind in children_only_kinds:
        clone.children = rebuild_children()
    elif kind == NodeKind.HLINE:
        # Horizontal rule: nothing to copy or recurse into.
        pass
    elif kind == NodeKind.LIST or kind == NodeKind.LIST_ITEM:
        # The string arg is copied verbatim; only children are processed.
        clone.sarg = contents.sarg
        clone.children = rebuild_children()
    elif kind in args_only_kinds:
        if kind == NodeKind.TEMPLATE:
            template_ns_id = wxr.wtp.NAMESPACE_DATA["Template"]["id"]
            clone = TemplateNode(
                clone.loc,
                wxr.wtp.namespace_prefixes(template_ns_id),
            )
        clone.largs = rebuild_args()
    elif kind == NodeKind.HTML:
        # Tag name and attributes are copied verbatim; children processed.
        clone = HTMLNode(clone.loc)
        clone.attrs = contents.attrs
        clone.sarg = contents.sarg
        clone.children = rebuild_children()
    else:
        raise RuntimeError(f"extract_ruby: unhandled kind {kind}")

    return found, [clone]