Coverage for src/wiktextract/extractor/ja/example.py: 74%

1from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode

3from ...page import clean_node

4from ...wxr_context import WiktextractContext

5from ..ruby import extract_ruby

6from ..share import calculate_bold_offsets

7from .linkage import (

8 LINKAGE_TEMPLATES,

9 extract_gloss_list_linkage_template,

10 process_linkage_list_item,

11)

12from .models import Example, Sense, WordEntry

13from .section_titles import LINKAGES

def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    parent_list_text: str = "",
) -> None:
    """Extract data from one list item found under a sense.

    Depending on the item's content this either:

    * hands the item to the linkage extractors (when it starts with a
      linkage title followed by ":" or contains a linkage template),
    * hands "ux"/"uxi"/"quote"/"quote-book" templates to their extractors,
    * stores a usage note in ``sense.notes`` (bold "注．" marker),
    * appends a plain-text ``Example`` to ``sense.examples``, or
    * treats the item as a heading for nested list items and recurses into
      them, passing the heading text down as ``parent_list_text``.

    Args:
        wxr: Extraction context used for parsing and cleaning nodes.
        word_entry: Entry that receives linkage data.
        sense: Sense that receives examples and notes.
        list_item: The list item node to inspect.
        parent_list_text: Text of the enclosing list item; when non-empty it
            is used as the example's ``ref``.
    """
    # https://ja.wiktionary.org/wiki/Wiktionary:用例#用例を示す形式

    # First decide whether this item is really linkage data.
    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            if ":" in child:
                # Everything before the colon is the candidate linkage title.
                heading = clean_node(
                    wxr, None, list_item.children[:position]
                )
                if heading in LINKAGES:
                    first_gloss = sense.glosses[0] if sense.glosses else ""
                    process_linkage_list_item(
                        wxr, word_entry, list_item, "", first_gloss
                    )
                    return
        elif (
            isinstance(child, TemplateNode)
            and child.template_name in LINKAGE_TEMPLATES
        ):
            extract_gloss_list_linkage_template(wxr, word_entry, child)
            return

    # Look for bold text anywhere in the item except inside nested lists.
    bold_outside_sublists = False
    for child in list_item.children:
        if not isinstance(child, WikiNode) or child.kind == NodeKind.LIST:
            continue
        if child.contain_node(NodeKind.BOLD) or child.kind == NodeKind.BOLD:
            bold_outside_sublists = True
            break

    if not bold_outside_sublists and list_item.contain_node(NodeKind.LIST):
        # No bold text but a nested list: this item is a heading for the
        # nested items, so its text (plus any <ref> text) is passed down.
        heading_parts = [
            clean_node(
                wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
            )
        ]
        heading_parts.extend(
            clean_node(wxr, None, ref_tag.children)
            for ref_tag in list_item.find_html("ref")
        )
        heading_text = " ".join(heading_parts)
        for nested_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, word_entry, sense, nested_item, heading_text
            )
        return

    # From here on: has bold node or doesn't have list child node.
    template_handled = False
    for template in list_item.find_child(NodeKind.TEMPLATE):
        template_name = template.template_name
        if template_name in ("ux", "uxi"):
            process_ux_template(wxr, template, sense)
            template_handled = True
        elif template_name in ("quote", "quote-book"):
            extract_quote_template(wxr, template, sense)
            template_handled = True
    if template_handled:
        return

    for bold_pos, bold_node in list_item.find_child(NodeKind.BOLD, True):
        if clean_node(wxr, None, bold_node) != "注．":
            continue
        # The text after the bold marker is a usage note, not an example.
        note_text = clean_node(
            wxr, None, list_item.children[bold_pos + 1 :]
        ).lstrip(": ")
        if note_text != "":
            sense.notes.append(note_text)
        return

    # Plain example: expand everything except nested lists, then split off
    # ruby annotations from the base text.
    own_children = list(list_item.invert_find_child(NodeKind.LIST))
    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(own_children), expand_all=True
    )
    ruby, plain_nodes = extract_ruby(wxr, expanded_root.children)
    example = Example(text=clean_node(wxr, None, plain_nodes), ruby=ruby)
    reparsed_plain = wxr.wtp.parse(wxr.wtp.node_to_wikitext(plain_nodes))
    calculate_bold_offsets(
        wxr, reparsed_plain, example.text, example, "bold_text_offsets"
    )

    # Nested list items hold the translation; the last one found wins.
    for nested_item in list_item.find_child_recursively(NodeKind.LIST_ITEM):
        example.translation = clean_node(wxr, None, nested_item.children)

    if parent_list_text:
        example.ref = parent_list_text
    else:
        # Otherwise the source citation trails the text, introduced by one
        # of these markers; split at the last occurrence of the first
        # marker that is present.
        for marker in ("（", "――"):
            if marker not in example.text:
                continue
            split_at = example.text.rindex(marker)
            example.ref = example.text[split_at:]
            example.text = example.text[:split_at].strip()
            for ref_tag in expanded_root.find_html_recursively("ref"):
                example.ref += " " + clean_node(wxr, None, ref_tag.children)
            break

    sense.examples.append(example)

118

119

def process_ux_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Turn an expanded "ux"/"uxi" template into an ``Example`` on ``sense``.

    The template is expanded and its HTML scanned: ``<i>`` tags whose class
    contains "e-example" or "e-transliteration" supply the example text and
    romanization, and ``<span>`` tags whose class contains "e-translation"
    supply the translation. Bold offsets are computed for each field that is
    filled. The example is appended only when its text is non-empty.

    Args:
        wxr: Extraction context used for parsing and cleaning nodes.
        t_node: The template node to expand.
        sense: Sense that receives the example.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ux
    # https://ja.wiktionary.org/wiki/テンプレート:uxi
    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example()

    # Ordered (class marker, Example field, offsets field); first match wins.
    italic_fields = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
    )
    for italic_tag in expanded_root.find_html_recursively("i"):
        tag_class = italic_tag.attrs.get("class", "")
        for marker, field_name, offsets_name in italic_fields:
            if marker not in tag_class:
                continue
            setattr(example, field_name, clean_node(wxr, None, italic_tag))
            calculate_bold_offsets(
                wxr,
                italic_tag,
                getattr(example, field_name),
                example,
                offsets_name,
            )
            break

    for span in expanded_root.find_html_recursively("span"):
        if "e-translation" not in span.attrs.get("class", ""):
            continue
        example.translation = clean_node(wxr, None, span)
        calculate_bold_offsets(
            wxr,
            span,
            example.translation,
            example,
            "bold_translation_offsets",
        )

    if example.text != "":
        sense.examples.append(example)
    # NOTE(review): called with `sense` for its side effects on the sense
    # (presumably category collection) - confirm against clean_node.
    clean_node(wxr, sense, t_node)

155

156

def extract_quote_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Turn an expanded quote template into an ``Example`` on ``sense``.

    The template is expanded and its ``<span>`` tags are classified by their
    class attribute: " e-quotation" fills the text, "e-transliteration" the
    romanization, "e-translation" the translation (each with bold offsets),
    and "cited-source" the reference. Any ``<ref>`` tag in the expansion then
    replaces the reference with its own text. The example is appended only
    when its text is non-empty.

    Args:
        wxr: Extraction context used for parsing and cleaning nodes.
        t_node: The template node to expand.
        sense: Sense that receives the example.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:quote
    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example()

    # Ordered (class marker, Example field, offsets field or None); the first
    # marker found in the class attribute decides how the span is used.
    span_fields = (
        (" e-quotation", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("cited-source", "ref", None),
    )
    for span in expanded_root.find_html_recursively("span"):
        tag_class = span.attrs.get("class", "")
        for marker, field_name, offsets_name in span_fields:
            if marker not in tag_class:
                continue
            setattr(example, field_name, clean_node(wxr, None, span))
            if offsets_name is not None:
                calculate_bold_offsets(
                    wxr,
                    span,
                    getattr(example, field_name),
                    example,
                    offsets_name,
                )
            break

    # A <ref> tag overrides whatever "cited-source" provided.
    for ref_tag in expanded_root.find_html_recursively("ref"):
        example.ref = clean_node(wxr, None, ref_tag.children)

    if example.text != "":
        sense.examples.append(example)
    # NOTE(review): called with `sense` for its side effects on the sense
    # (presumably category collection) - confirm against clean_node.
    clean_node(wxr, sense, t_node)

202 clean_node(wxr, sense, t_node)