Coverage for src/wiktextract/extractor/ruby.py: 86%
77 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from typing import Optional, Union
3from wikitextprocessor import NodeKind, WikiNode
4from wikitextprocessor.parser import (
5 GeneralNode,
6 HTMLNode,
7 LevelNode,
8 TemplateNode,
9)
11from wiktextract.page import clean_node
12from wiktextract.wxr_context import WiktextractContext
def parse_ruby(
    wxr: WiktextractContext, node: WikiNode
) -> Optional[tuple[str, str]]:
    """Split an HTML 'ruby' node into its base text and its furigana.

    The node's children are sorted into three groups: 'rt' HTML elements
    form the furigana, 'rp' HTML elements (the fallback parentheses) are
    dropped, and every other child (strings included) forms the base
    (kanji) text. Both groups are run through clean_node() and stripped.

    Returns a ``(kanji, furigana)`` tuple, or None if either part is empty
    after cleaning.
    """
    base_parts: list[Union[str, WikiNode]] = []
    # Only WikiNodes are ever appended here; the wider element type just
    # matches what clean_node() accepts so the type-checker stays quiet.
    reading_parts: list[Union[str, WikiNode]] = []

    for child in node.children:
        is_html = isinstance(child, WikiNode) and child.kind == NodeKind.HTML
        tag = child.sarg if is_html else None
        if tag == "rt":
            reading_parts.append(child)
        elif tag != "rp":
            # Anything that is not an rt/rp element belongs to the base text.
            base_parts.append(child)

    kanji = clean_node(wxr, None, base_parts).strip()
    reading = clean_node(wxr, None, reading_parts).strip()
    if kanji and reading:
        return kanji, reading
    # As on the page パイスラッシュ, a template can apparently produce a ruby
    # element with one part empty, which leaves no trace of the broken ruby
    # element in the page's final HTML source; report it as "no ruby".
    return None
def extract_ruby(
    wxr: WiktextractContext,
    contents: GeneralNode,
) -> tuple[list[tuple[str, str]], list[Union[WikiNode, str]]]:
    """Pull ruby annotations out of ``contents`` and rebuild it without them.

    ``contents`` may be a list/tuple, a WikiNode, or any other value
    (typically a string). The function walks it recursively and returns a
    pair ``(ruby_pairs, rebuilt)``:

    * ``ruby_pairs`` -- the ``(kanji, furigana)`` tuples produced by
      parse_ruby() for every HTML 'ruby' node that parsed successfully.
    * ``rebuilt`` -- a list holding a copy of the contents in which each such
      ruby node has been replaced by its kanji string.

    Raises RuntimeError for a WikiNode whose kind is not handled below.
    """
    # Sequences: handle each element on its own and concatenate the results.
    if isinstance(contents, (list, tuple)):
        ruby_pairs = []
        rebuilt_items = []
        for item in contents:
            item_pairs, item_nodes = extract_ruby(wxr, item)
            ruby_pairs.extend(item_pairs)
            rebuilt_items.extend(item_nodes)
        return ruby_pairs, rebuilt_items

    # Anything that is not a WikiNode is passed through unchanged.
    if not isinstance(contents, WikiNode):
        return [], [contents]

    kind = contents.kind

    # A ruby element that parses cleanly is replaced by its kanji text.
    # If parsing fails we fall through and treat it like any other HTML node.
    if kind == NodeKind.HTML and contents.sarg == "ruby":
        parsed = parse_ruby(wxr, contents)
        if parsed is not None:
            kanji = parsed[0]
            return [parsed], [kanji]

    ruby_pairs = []
    rebuilt = WikiNode(kind, contents.loc)

    def rebuild_args():
        # Recurse into every argument, collecting any ruby pairs on the way.
        fresh_args = []
        for arg in contents.largs:
            arg_pairs, arg_nodes = extract_ruby(wxr, arg)
            fresh_args.append(arg_nodes)
            ruby_pairs.extend(arg_pairs)
        return fresh_args

    def rebuild_children():
        # Recurse into the child list, collecting any ruby pairs on the way.
        child_pairs, child_nodes = extract_ruby(wxr, contents.children)
        ruby_pairs.extend(child_pairs)
        return child_nodes

    args_and_children = {
        NodeKind.LEVEL2,
        NodeKind.LEVEL3,
        NodeKind.LEVEL4,
        NodeKind.LEVEL5,
        NodeKind.LEVEL6,
        NodeKind.LINK,
    }
    children_only = {
        NodeKind.ITALIC,
        NodeKind.BOLD,
        NodeKind.TABLE,
        NodeKind.TABLE_CAPTION,
        NodeKind.TABLE_ROW,
        NodeKind.TABLE_HEADER_CELL,
        NodeKind.TABLE_CELL,
        NodeKind.PRE,
        NodeKind.PREFORMATTED,
    }
    args_only = {
        NodeKind.TEMPLATE,
        NodeKind.TEMPLATE_ARG,
        NodeKind.PARSER_FN,
        NodeKind.URL,
    }

    if kind in args_and_children:
        # Headings get a LevelNode; links stay a plain WikiNode.
        if kind != NodeKind.LINK:
            rebuilt = LevelNode(kind, rebuilt.loc)
        # Args first, then children, so pairs are collected in that order.
        rebuilt.largs = rebuild_args()
        rebuilt.children = rebuild_children()
    elif kind in children_only:
        rebuilt.children = rebuild_children()
    elif kind == NodeKind.HLINE:
        # Nothing to copy: no arguments and no children.
        pass
    elif kind in (NodeKind.LIST, NodeKind.LIST_ITEM):
        # The string argument is carried over untouched.
        rebuilt.sarg = contents.sarg
        rebuilt.children = rebuild_children()
    elif kind in args_only:
        if kind == NodeKind.TEMPLATE:
            template_ns_id = wxr.wtp.NAMESPACE_DATA["Template"]["id"]
            rebuilt = TemplateNode(
                rebuilt.loc,
                wxr.wtp.namespace_prefixes(template_ns_id),
            )
        rebuilt.largs = rebuild_args()
    elif kind == NodeKind.HTML:
        # Attributes and tag name are carried over untouched.
        rebuilt = HTMLNode(rebuilt.loc)
        rebuilt.attrs = contents.attrs
        rebuilt.sarg = contents.sarg
        rebuilt.children = rebuild_children()
    else:
        raise RuntimeError(f"extract_ruby: unhandled kind {kind}")

    return ruby_pairs, [rebuilt]