Coverage for src/wiktextract/extractor/ruby.py: 90%

1from typing import Optional, Union

3from wikitextprocessor import NodeKind, WikiNode

4from wikitextprocessor.parser import (

5 GeneralNode,

6 HTMLNode,

7 LevelNode,

8 TemplateNode,

11from wiktextract.page import clean_node

12from wiktextract.wxr_context import WiktextractContext

def parse_ruby(
    wxr: WiktextractContext, node: WikiNode
) -> Optional[tuple[str, str]]:
    """Split an HTML 'ruby' node into its base (kanji) text and furigana.

    Children that are HTML 'rt' elements form the furigana. HTML 'rp'
    elements (the fallback parentheses) are dropped. Every other child,
    string or node, is part of the base text.

    Returns a `(base_text, furigana)` tuple of cleaned, stripped strings,
    or None when either of the two comes out empty.
    """
    base_parts: list[Union[str, WikiNode]] = []
    # Only WikiNodes are ever appended here; the wider element type is kept
    # so the list can be handed to clean_node() without type complaints.
    reading_parts: list[Union[str, WikiNode]] = []

    for part in node.children:
        is_html = isinstance(part, WikiNode) and part.kind == NodeKind.HTML
        if is_html and part.sarg == "rt":
            reading_parts.append(part)
        elif is_html and part.sarg == "rp":
            # Parentheses for browsers without ruby support: not needed.
            continue
        else:
            base_parts.append(part)

    ruby_kanji = clean_node(wxr, None, base_parts).strip()
    furigana = clean_node(wxr, None, reading_parts).strip()

    # A template can produce a ruby element with an empty part (seen on
    # パイスラッシュ); such a broken element is reported as "no ruby".
    if ruby_kanji and furigana:
        return ruby_kanji, furigana
    return None

def extract_ruby(
    wxr: WiktextractContext,
    contents: GeneralNode,
) -> tuple[list[tuple[str, str]], list[Union[WikiNode, str]]]:
    """Pull ruby annotations out of `contents` and rebuild it without them.

    `contents` may be a string, a WikiNode, or a list/tuple of those.
    Every HTML 'ruby' element that parse_ruby() can parse is replaced by
    its base text. All other nodes are copied into fresh nodes, with the
    parts that can hold nested content processed recursively.

    Returns a pair `(extracted, new_contents)`: the `(base_text, furigana)`
    tuples that were found, and the rebuilt contents as a list.

    Raises RuntimeError for a node kind this function does not handle.
    """
    found = []
    rebuilt = []

    # Sequences: handle each element separately and splice the results.
    if isinstance(contents, (list, tuple)):
        for item in contents:
            pairs, nodes = extract_ruby(wxr, item)
            found.extend(pairs)
            rebuilt.extend(nodes)
        return found, rebuilt

    # Anything that is not a WikiNode is passed through unchanged.
    if not isinstance(contents, WikiNode):
        return [], [contents]

    kind = contents.kind

    # A parsable ruby element collapses to its base text.
    if kind == NodeKind.HTML and contents.sarg == "ruby":
        pair = parse_ruby(wxr, contents)
        if pair is not None:
            return [pair], [pair[0]]
        # An unparsable ruby element is treated like any other HTML node.

    def rebuild_children(target):
        # Recurse into the source node's children and attach the result.
        pairs, nodes = extract_ruby(wxr, contents.children)
        found.extend(pairs)
        target.children = nodes

    def rebuild_args(target):
        # Recurse into each argument of the source node, one at a time.
        fresh_args = []
        for arg in contents.largs:
            pairs, nodes = extract_ruby(wxr, arg)
            fresh_args.append(nodes)
            found.extend(pairs)
        target.largs = fresh_args

    copy = WikiNode(kind, contents.loc)
    if kind in (
        NodeKind.LEVEL2,
        NodeKind.LEVEL3,
        NodeKind.LEVEL4,
        NodeKind.LEVEL5,
        NodeKind.LEVEL6,
        NodeKind.LINK,
    ):
        # Both args and children can hold content.
        if kind != NodeKind.LINK:
            copy = LevelNode(kind, copy.loc)
        rebuild_args(copy)
        rebuild_children(copy)
    elif kind in (
        NodeKind.ITALIC,
        NodeKind.BOLD,
        NodeKind.TABLE,
        NodeKind.TABLE_CAPTION,
        NodeKind.TABLE_ROW,
        NodeKind.TABLE_HEADER_CELL,
        NodeKind.TABLE_CELL,
        NodeKind.PRE,
        NodeKind.PREFORMATTED,
    ):
        # Only children are processed.
        rebuild_children(copy)
    elif kind == NodeKind.HLINE:
        # Neither args nor children.
        pass
    elif kind in (NodeKind.LIST, NodeKind.LIST_ITEM):
        # sarg is copied verbatim; children are processed.
        copy.sarg = contents.sarg
        rebuild_children(copy)
    elif kind in (
        NodeKind.TEMPLATE,
        NodeKind.TEMPLATE_ARG,
        NodeKind.PARSER_FN,
        NodeKind.URL,
    ):
        # Only args are processed.
        if kind == NodeKind.TEMPLATE:
            copy = TemplateNode(
                copy.loc,
                wxr.wtp.namespace_prefixes(
                    wxr.wtp.NAMESPACE_DATA["Template"]["id"]
                ),
            )
        rebuild_args(copy)
    elif kind == NodeKind.HTML:
        # attrs and sarg are copied verbatim; children are processed.
        copy = HTMLNode(copy.loc)
        copy.attrs = contents.attrs
        copy.sarg = contents.sarg
        rebuild_children(copy)
    else:
        raise RuntimeError(f"extract_ruby: unhandled kind {kind}")

    rebuilt.append(copy)
    return found, rebuilt

79 new_args = []

80 for arg in contents.largs:

81 e1, c1 = extract_ruby(wxr, arg)

82 new_args.append(c1)

83 extracted.extend(e1)

84 new_node.largs = new_args

85 e1, c1 = extract_ruby(wxr, contents.children)

86 extracted.extend(e1)

87 new_node.children = c1

88 elif kind in {

89 NodeKind.ITALIC,

90 NodeKind.BOLD,

91 NodeKind.TABLE,

92 NodeKind.TABLE_CAPTION,

93 NodeKind.TABLE_ROW,

94 NodeKind.TABLE_HEADER_CELL,

95 NodeKind.TABLE_CELL,

96 NodeKind.PRE,

97 NodeKind.PREFORMATTED,

98 }:

99 # Process only children

100 e1, c1 = extract_ruby(wxr, contents.children)

101 extracted.extend(e1)

102 new_node.children = c1

103 elif kind == NodeKind.HLINE: 103 ↛ 105line 103 didn't jump to line 105 because the condition on line 103 was never true

104 # No arguments or children

105 pass

106 elif kind in (NodeKind.LIST, NodeKind.LIST_ITEM):

107 # Keep args as-is, process children

108 new_node.sarg = contents.sarg

109 e1, c1 = extract_ruby(wxr, contents.children)

110 extracted.extend(e1)

111 new_node.children = c1

112 elif kind in {

113 NodeKind.TEMPLATE,

114 NodeKind.TEMPLATE_ARG,

115 NodeKind.PARSER_FN,

116 NodeKind.URL,

117 }:

118 # Process only args

119 if kind == NodeKind.TEMPLATE: 119 ↛ 120line 119 didn't jump to line 120 because the condition on line 119 was never true

120 new_node = TemplateNode(

121 new_node.loc,

122 wxr.wtp.namespace_prefixes(

123 wxr.wtp.NAMESPACE_DATA["Template"]["id"]

124 ),

125 )

126 new_args = []

127 for arg in contents.largs:

128 e1, c1 = extract_ruby(wxr, arg)

129 new_args.append(c1)

130 extracted.extend(e1)

131 new_node.largs = new_args

132 elif kind == NodeKind.HTML: 132 ↛ 141line 132 didn't jump to line 141 because the condition on line 132 was always true

133 # Keep attrs and args as-is, process children

134 new_node = HTMLNode(new_node.loc)

135 new_node.attrs = contents.attrs

136 new_node.sarg = contents.sarg

137 e1, c1 = extract_ruby(wxr, contents.children)

138 extracted.extend(e1)

139 new_node.children = c1

140 else:

141 raise RuntimeError(f"extract_ruby: unhandled kind {kind}")

142 new_contents.append(new_node)

143 return extracted, new_contents