Coverage for src/wiktextract/extractor/ja/example.py: 74%

98 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets 

7from .linkage import ( 

8 LINKAGE_TEMPLATES, 

9 extract_gloss_list_linkage_template, 

10 process_linkage_list_item, 

11) 

12from .models import Example, Sense, WordEntry 

13from .section_titles import LINKAGES 

14 

15 

def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    parent_list_text: str = "",
) -> None:
    """Extract data from one list item found under a sense.

    Depending on the shape of ``list_item`` this function does one of:

    * forwards the item to the linkage extractors (text before a ``:``
      that is a key of ``LINKAGES``, or a template listed in
      ``LINKAGE_TEMPLATES``);
    * hands ``ux``/``uxi``/``quote``/``quote-book`` templates to their
      dedicated processors;
    * stores the text following a bold ``注.`` marker in ``sense.notes``;
    * appends a new ``Example`` to ``sense.examples``;
    * recurses into nested list items, passing this item's own text down
      as ``parent_list_text``.

    Args:
        wxr: Extraction context; ``wxr.wtp`` is used to re-parse wikitext.
        word_entry: Entry passed through to the linkage extractors.
        sense: Sense that receives examples and notes.
        list_item: The list item node to process.
        parent_list_text: Text of the enclosing list item; when non-empty
            it becomes the ``ref`` of the example built here.
    """
    # https://ja.wiktionary.org/wiki/Wiktionary:用例#用例を示す形式
    children = list_item.children

    # Linkage data is either announced by text ending in ":" whose preceding
    # nodes clean to a known linkage title, or carried by a linkage template.
    for position, child in enumerate(children):
        if isinstance(child, str):
            if ":" not in child:
                continue
            heading = clean_node(wxr, None, children[:position])
            if heading not in LINKAGES:
                continue
            first_gloss = sense.glosses[0] if len(sense.glosses) > 0 else ""
            process_linkage_list_item(
                wxr, word_entry, list_item, "", first_gloss
            )
            return
        if (
            isinstance(child, TemplateNode)
            and child.template_name in LINKAGE_TEMPLATES
        ):
            extract_gloss_list_linkage_template(wxr, word_entry, child)
            return

    non_list_nodes = (
        child
        for child in children
        if isinstance(child, WikiNode) and child.kind != NodeKind.LIST
    )
    has_bold = any(
        node.contain_node(NodeKind.BOLD) or node.kind == NodeKind.BOLD
        for node in non_list_nodes
    )

    if not has_bold and list_item.contain_node(NodeKind.LIST):
        # No bold text but a nested list: this item's own text (plus any
        # direct <ref> contents) is handed to every nested list item.
        text_parts = [
            clean_node(
                wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
            )
        ]
        text_parts.extend(
            clean_node(wxr, None, ref_tag.children)
            for ref_tag in list_item.find_html("ref")
        )
        inherited_text = " ".join(text_parts)
        for nested_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, word_entry, sense, nested_item, inherited_text
            )
        return

    # From here on: the item has a bold node or has no nested list.
    # NOTE(review): the coverage report this code was taken from shows the
    # quote/quote-book dispatch was never exercised by the test suite.
    template_handlers = {
        "ux": process_ux_template,
        "uxi": process_ux_template,
        "quote": extract_quote_template,
        "quote-book": extract_quote_template,
    }
    handled_template = False
    for template in list_item.find_child(NodeKind.TEMPLATE):
        handler = template_handlers.get(template.template_name)
        if handler is not None:
            handler(wxr, template, sense)
            handled_template = True
    if handled_template:
        return

    # A bold "注." marks a note: everything after the marker is the note text.
    for bold_position, bold in list_item.find_child(NodeKind.BOLD, True):
        if clean_node(wxr, None, bold) != "注.":
            continue
        trailing_nodes = children[bold_position + 1 :]
        note_text = clean_node(wxr, None, trailing_nodes).lstrip(": ")
        if note_text:
            sense.notes.append(note_text)
        return

    # Plain example: expand everything except nested lists, then separate
    # ruby annotations from the remaining nodes.
    own_nodes = list(list_item.invert_find_child(NodeKind.LIST))
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(own_nodes), expand_all=True
    )
    ruby, no_ruby = extract_ruby(wxr, expanded.children)
    example = Example(text=clean_node(wxr, None, no_ruby), ruby=ruby)
    reparsed_text_nodes = wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby))
    calculate_bold_offsets(
        wxr, reparsed_text_nodes, example.text, example, "bold_text_offsets"
    )

    # Each nested list item overwrites the translation; the last one wins.
    for nested_item in list_item.find_child_recursively(NodeKind.LIST_ITEM):
        example.translation = clean_node(wxr, None, nested_item.children)

    if len(parent_list_text) > 0:
        example.ref = parent_list_text
    else:
        # Split a trailing source reference off the text at the last
        # occurrence of the first marker that is present.
        for marker in ("(", "――"):
            cut = example.text.rfind(marker)
            if cut == -1:
                continue
            example.ref = example.text[cut:]
            example.text = example.text[:cut].strip()
            for ref_tag in expanded.find_html_recursively("ref"):
                example.ref += " " + clean_node(wxr, None, ref_tag.children)
            break
    sense.examples.append(example)

118 

119 

def process_ux_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Build an ``Example`` from an expanded ``ux``/``uxi`` template.

    The template is expanded and re-parsed; ``<i>`` tags whose class
    contains ``e-example`` or ``e-transliteration`` fill ``text`` and
    ``roman``, and ``<span>`` tags whose class contains ``e-translation``
    fill ``translation``. Bold offsets are computed for every field that
    is filled. The example is appended to ``sense.examples`` only when its
    text is non-empty.

    Args:
        wxr: Extraction context; ``wxr.wtp`` expands and parses the template.
        t_node: The ``ux``/``uxi`` template node.
        sense: Sense that receives the example.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ux
    # https://ja.wiktionary.org/wiki/テンプレート:uxi
    example = Example()
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    for italic in expanded.find_html_recursively("i"):
        css_class = italic.attrs.get("class", "")
        # "e-example" takes precedence when both markers are present.
        if "e-example" in css_class:
            field, offsets_field = "text", "bold_text_offsets"
        elif "e-transliteration" in css_class:
            field, offsets_field = "roman", "bold_roman_offsets"
        else:
            continue
        setattr(example, field, clean_node(wxr, None, italic))
        calculate_bold_offsets(
            wxr, italic, getattr(example, field), example, offsets_field
        )

    for span in expanded.find_html_recursively("span"):
        if "e-translation" not in span.attrs.get("class", ""):
            continue
        example.translation = clean_node(wxr, None, span)
        calculate_bold_offsets(
            wxr,
            span,
            example.translation,
            example,
            "bold_translation_offsets",
        )

    if example.text != "":
        sense.examples.append(example)
    # Called with `sense` whether or not an example was stored; presumably
    # this lets clean_node record template side data on the sense — confirm
    # against clean_node's implementation.
    clean_node(wxr, sense, t_node)

155 

156 

def extract_quote_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Build an ``Example`` from an expanded ``quote`` template.

    The template is expanded and re-parsed, then every ``<span>`` is
    matched against a list of class markers; the first marker contained in
    the span's class decides which ``Example`` field receives the span's
    cleaned text. ``<ref>`` tags, if any, overwrite ``ref`` afterwards. The
    example is appended to ``sense.examples`` only when its text is
    non-empty.

    Args:
        wxr: Extraction context; ``wxr.wtp`` expands and parses the template.
        t_node: The quote template node.
        sense: Sense that receives the example.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:quote
    example = Example()
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    # (class marker, Example field, bold-offsets field or None), checked in
    # order; only the first matching marker is applied to a span.
    # NOTE(review): " e-quotation" carries a leading space, so a span whose
    # class attribute *starts* with "e-quotation" is not matched — confirm
    # whether that is intended.
    span_fields = (
        (" e-quotation", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("cited-source", "ref", None),
    )
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        for marker, field, offsets_field in span_fields:
            if marker not in css_class:
                continue
            setattr(example, field, clean_node(wxr, None, span))
            if offsets_field is not None:
                calculate_bold_offsets(
                    wxr,
                    span,
                    getattr(example, field),
                    example,
                    offsets_field,
                )
            break

    # A <ref> tag replaces any reference taken from a "cited-source" span;
    # with several <ref> tags the last one wins.
    for ref_tag in expanded.find_html_recursively("ref"):
        example.ref = clean_node(wxr, None, ref_tag.children)

    if example.text != "":
        sense.examples.append(example)
    # Called with `sense` whether or not an example was stored; presumably
    # this lets clean_node record template side data on the sense — confirm
    # against clean_node's implementation.
    clean_node(wxr, sense, t_node)

202 clean_node(wxr, sense, t_node)