Coverage for src/wiktextract/extractor/ja/example.py: 74%

98 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets 

7from .linkage import ( 

8 LINKAGE_TEMPLATES, 

9 extract_gloss_list_linkage_template, 

10 process_linkage_list_item, 

11) 

12from .models import Example, Sense, WordEntry 

13from .section_titles import LINKAGES 

14 

15 

def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    parent_list_text: str = "",
) -> None:
    """Extract linkage, note or example data from one list item.

    The list item is handled by the first rule that applies:

    1. A string child containing ":" whose preceding children clean to a
       key of ``LINKAGES`` sends the whole item to
       ``process_linkage_list_item``; a child template named in
       ``LINKAGE_TEMPLATES`` goes to
       ``extract_gloss_list_linkage_template``.
    2. When the item has no bold node outside its nested lists and does
       contain a nested list, the item's own text (plus the text of its
       ``<ref>`` tags) is passed as ``parent_list_text`` to a recursive
       call for every nested list item.
    3. Otherwise ``ux``/``uxi``/``quote``/``quote-book`` templates are
       processed, or text after a bold "注." is stored in
       ``sense.notes``, or an ``Example`` is built from the item and
       appended to ``sense.examples``.

    Args:
        wxr: Context passed to the parsing and cleaning helpers.
        word_entry: Entry that receives linkage data.
        sense: Sense that receives examples and notes.
        list_item: The list item node to process.
        parent_list_text: Text of the enclosing list item; when not empty
            it becomes the ``ref`` of the extracted example.
    """
    # https://ja.wiktionary.org/wiki/Wiktionary:用例#用例を示す形式

    # Linkage data can appear in the example list; detect and delegate it.
    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            if ":" not in child:
                continue
            heading = clean_node(wxr, None, list_item.children[:position])
            if heading not in LINKAGES:
                # Not a linkage title: keep scanning the other children.
                continue
            first_gloss = sense.glosses[0] if len(sense.glosses) > 0 else ""
            process_linkage_list_item(
                wxr, word_entry, list_item, "", first_gloss
            )
            return
        if (
            isinstance(child, TemplateNode)
            and child.template_name in LINKAGE_TEMPLATES
        ):
            extract_gloss_list_linkage_template(wxr, word_entry, child)
            return

    # Look for a bold node in (or as) any child that is not a nested list.
    has_bold = False
    for child in list_item.children:
        if not isinstance(child, WikiNode) or child.kind == NodeKind.LIST:
            continue
        if child.contain_node(NodeKind.BOLD) or child.kind == NodeKind.BOLD:
            has_bold = True
            break

    if not has_bold and list_item.contain_node(NodeKind.LIST):
        # No bold text but a nested list: this item's text is handed down
        # to the nested items as their parent text.
        own_nodes = list(
            list_item.invert_find_child(NodeKind.LIST, include_empty_str=True)
        )
        own_text = clean_node(wxr, None, own_nodes)
        for ref_tag in list_item.find_html("ref"):
            own_text += " " + clean_node(wxr, None, ref_tag.children)
        for nested_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, word_entry, sense, nested_item, own_text
            )
        return

    # From here on: has a bold node or doesn't have a list child node.
    handled_by_template = False
    for t_node in list_item.find_child(NodeKind.TEMPLATE):
        template_name = t_node.template_name
        if template_name in ("ux", "uxi"):
            process_ux_template(wxr, t_node, sense)
            handled_by_template = True
        elif template_name in ("quote", "quote-book"):
            extract_quote_template(wxr, t_node, sense)
            handled_by_template = True
    if handled_by_template:
        return

    for bold_index, bold_node in list_item.find_child(NodeKind.BOLD, True):
        if clean_node(wxr, None, bold_node) != "注.":
            continue
        # Everything after the bold "注." marker is the note text.
        following_nodes = list_item.children[bold_index + 1 :]
        note = clean_node(wxr, None, following_nodes).lstrip(": ")
        if note:
            sense.notes.append(note)
        return

    own_nodes = list(
        list_item.invert_find_child(NodeKind.LIST, include_empty_str=True)
    )
    expanded_nodes = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(own_nodes), expand_all=True
    )
    ruby, no_ruby = extract_ruby(wxr, expanded_nodes.children)
    example = Example(text=clean_node(wxr, None, no_ruby), ruby=ruby)
    reparsed_no_ruby = wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby))
    calculate_bold_offsets(
        wxr, reparsed_no_ruby, example.text, example, "bold_text_offsets"
    )

    # Each nested list item overwrites the translation; the last one wins.
    for tr_list_item in list_item.find_child_recursively(NodeKind.LIST_ITEM):
        example.translation = clean_node(wxr, None, tr_list_item.children)

    if len(parent_list_text) > 0:
        example.ref = parent_list_text
    else:
        # Split a trailing source off the text at the last occurrence of
        # the first marker that is present.
        for marker in ("(", "――"):
            split_at = example.text.rfind(marker)
            if split_at == -1:
                continue
            example.ref = example.text[split_at:]
            example.text = example.text[:split_at].strip()
            for ref_tag in expanded_nodes.find_html_recursively("ref"):
                ref_text = clean_node(wxr, None, ref_tag.children)
                example.ref += " " + ref_text
            break
    sense.examples.append(example)

128 

129 

def process_ux_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Build an ``Example`` from an expanded ``ux``/``uxi`` template.

    The template is expanded and its HTML scanned: ``<i>`` tags whose
    class contains "e-example" or "e-transliteration" fill ``text`` and
    ``roman``; ``<span>`` tags whose class contains "e-translation" fill
    ``translation``. Bold offsets are calculated for every filled field.
    The example is appended to ``sense.examples`` only when its text is
    not empty.

    Args:
        wxr: Context passed to the parsing and cleaning helpers.
        t_node: The template node to expand.
        sense: Sense that receives the example; it is also passed to the
            final ``clean_node`` call on the template.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ux
    # https://ja.wiktionary.org/wiki/テンプレート:uxi
    example = Example()
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)

    for i_tag in expanded_node.find_html_recursively("i"):
        css_class = i_tag.attrs.get("class", "")
        if "e-example" in css_class:
            field, offsets_field = "text", "bold_text_offsets"
        elif "e-transliteration" in css_class:
            field, offsets_field = "roman", "bold_roman_offsets"
        else:
            continue
        setattr(example, field, clean_node(wxr, None, i_tag))
        # Read the value back from the model, as the offsets are computed
        # against the stored field value.
        calculate_bold_offsets(
            wxr, i_tag, getattr(example, field), example, offsets_field
        )

    for span_tag in expanded_node.find_html_recursively("span"):
        if "e-translation" not in span_tag.attrs.get("class", ""):
            continue
        example.translation = clean_node(wxr, None, span_tag)
        calculate_bold_offsets(
            wxr,
            span_tag,
            example.translation,
            example,
            "bold_translation_offsets",
        )

    if example.text != "":
        sense.examples.append(example)
    # Called with the sense (not None) on the raw template node; the
    # return value is intentionally unused.
    clean_node(wxr, sense, t_node)

165 

166 

def extract_quote_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Build an ``Example`` from an expanded ``quote`` template.

    Every ``<span>`` in the expanded template is matched against class
    markers in a fixed order: quotation text, transliteration,
    translation (each with bold offsets calculated), then "cited-source"
    which fills ``ref``. Any ``<ref>`` tag found afterwards overwrites
    ``ref`` with its cleaned children. The example is appended to
    ``sense.examples`` only when its text is not empty.

    Args:
        wxr: Context passed to the parsing and cleaning helpers.
        t_node: The template node to expand.
        sense: Sense that receives the example; it is also passed to the
            final ``clean_node`` call on the template.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:quote
    example = Example()
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)

    # (class marker, Example field, bold offsets field), checked in order.
    # NOTE(review): the quotation marker starts with a space; it is kept
    # exactly as found - confirm whether that is intentional.
    span_fields = (
        (" e-quotation", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
    )

    for span_tag in expanded_node.find_html_recursively("span"):
        css_class = span_tag.attrs.get("class", "")
        for marker, field, offsets_field in span_fields:
            if marker not in css_class:
                continue
            setattr(example, field, clean_node(wxr, None, span_tag))
            calculate_bold_offsets(
                wxr,
                span_tag,
                getattr(example, field),
                example,
                offsets_field,
            )
            break
        else:
            # None of the text markers matched this span.
            if "cited-source" in css_class:
                example.ref = clean_node(wxr, None, span_tag)

    for ref_tag in expanded_node.find_html_recursively("ref"):
        example.ref = clean_node(wxr, None, ref_tag.children)

    if example.text != "":
        sense.examples.append(example)
    # Called with the sense (not None) on the raw template node; the
    # return value is intentionally unused.
    clean_node(wxr, sense, t_node)