Coverage for src/wiktextract/extractor/zh/ 91%

113 statements  

« prev     ^ index     » next v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2from typing import Union 


4from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 


6from import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from ..share import strip_nodes 

10from .models import Form, WordEntry 

11from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags 



def extract_headword_line_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    node: TemplateNode,
    lang_code: str,
) -> None:
    """Extract headword data from the first template of a headword line.

    The template is expanded and every ``<span class="headword-line">`` in
    the expansion is scanned.  Romanizations, gender tags and canonical
    forms are stored on ``page_data[-1]``; the children that follow the
    headword itself are handed to ``extract_headword_forms``.

    Only the ``head`` template and templates named ``<lang_code>-...`` are
    processed, and templates whose name ends with ``-see`` are skipped.
    """
    template_name = node.template_name
    if (
        template_name != "head"
        and not template_name.startswith(f"{lang_code}-")
    ) or template_name.endswith("-see"):
        return

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # The returned text is discarded; the call is kept for whatever it
    # records on the entry (presumably category links - verify in clean_node).
    clean_node(wxr, page_data[-1], expanded_node)
    for span_node in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        # Reset per headword line so an index found in a previous span is
        # never used to slice this span's children.
        forms_start_index = 0
        for index, span_child in span_node.find_child(NodeKind.HTML, True):
            if span_child.tag == "span":
                forms_start_index = index + 1
                class_names = span_child.attrs.get("class", "")
                if "headword-tr" in class_names:
                    page_data[-1].forms.append(
                        Form(
                            form=clean_node(wxr, page_data[-1], span_child),
                            tags=["romanization"],
                        )
                    )
                elif "gender" in class_names:
                    for abbr_tag in span_child.find_html("abbr"):
                        # Use the cleaned text (as process_forms_text does
                        # for gender spans) rather than children[0], which
                        # fails on an empty <abbr> and may not be a string.
                        gender = clean_node(wxr, None, abbr_tag)
                        if not gender:
                            continue
                        if gender in TEMPLATE_TAG_ARGS:
                            page_data[-1].tags.append(
                                TEMPLATE_TAG_ARGS[gender]
                            )
                        else:
                            page_data[-1].raw_tags.append(gender)
                            translate_raw_tags(page_data[-1])
                else:
                    for strong_node in span_child.find_html(
                        "strong", attr_name="class", attr_value="headword"
                    ):
                        process_ja_headword(wxr, page_data, strong_node)
            elif (
                span_child.tag == "strong"
                and "headword" in span_child.attrs.get("class", "")
            ):
                forms_start_index = index + 1
                if lang_code == "ja":
                    process_ja_headword(wxr, page_data, span_child)
            elif span_child.tag == "b":
                # this is a form <b> tag, already inside form parentheses
                break

        extract_headword_forms(
            wxr, page_data, span_node.children[forms_start_index:]
        )



def process_ja_headword(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    strong_node: HTMLNode,
) -> None:
    """Add the headword held in ``strong_node`` as a "canonical" form.

    Ruby annotations are separated from the node with ``extract_ruby``.
    The form is appended to ``page_data[-1].forms`` only when its text is
    non-empty and it either has ruby data or differs from the entry's
    ``word``.
    """
    ruby_data, node_without_ruby = extract_ruby(wxr, strong_node)
    form = clean_node(wxr, page_data[-1], node_without_ruby)
    if (len(ruby_data) > 0 or form != page_data[-1].word) and len(form) > 0:
        page_data[-1].forms.append(
            Form(
                # Reuse the text cleaned above instead of running
                # clean_node on the same node a second time.
                form=form,
                ruby=ruby_data,
                tags=["canonical"],
            )
        )



def extract_headword_forms(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    form_nodes: list[Union[WikiNode, str]],
) -> None:
    """Split headword-line nodes at commas and process each group.

    A string node that starts with a comma closes the current group: the
    nodes collected so far are passed to ``process_forms_text`` and a new
    group is started with the rest of that string (comma removed).  Any
    nodes left over after the loop are processed as the final group.
    """
    # Both the ASCII comma and the full-width comma (U+FF0C) separate
    # forms.  The full-width one is written as an escape so it cannot be
    # silently flattened into a second ASCII comma.
    separators = (",", "\uff0c")
    current_nodes = []
    for node in form_nodes:
        if isinstance(node, str) and node.startswith(separators):
            process_forms_text(wxr, page_data, current_nodes)
            # Both separators are a single character long.
            current_nodes = [node[1:]]
        else:
            current_nodes.append(node)

    if len(current_nodes) > 0:
        process_forms_text(wxr, page_data, current_nodes)



def process_forms_text(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    form_nodes: list[Union[WikiNode, str]],
) -> None:
    """Convert one group of headword-line nodes into forms or entry tags.

    Every ``<b>`` element becomes a ``Form`` on ``page_data[-1]``.  The
    nodes seen before it supply its raw tags, and a directly following
    ``<span>`` whose class contains "gender" supplies a gender tag.  A
    ``<span>`` whose class contains "tr" sets the romanization of the most
    recently added form, and for Japanese entries a ``<sup>`` element is
    passed to ``extract_historical_kana``.  If the group holds no ``<b>``
    element, the collected text is treated as tags of the entry itself.
    """
    pending_tag_nodes = []
    found_form = False
    stripped = list(strip_nodes(form_nodes))
    entry = page_data[-1]
    lang_code = entry.lang_code
    last_position = len(stripped) - 1

    for position, item in enumerate(stripped):
        # Anything that is not an HTML element is tag text.
        if not (isinstance(item, WikiNode) and item.kind == NodeKind.HTML):
            pending_tag_nodes.append(item)
            continue

        if item.tag == "b":
            found_form = True
            ruby_data = []
            if lang_code == "ja":
                ruby_data, ruby_free_node = extract_ruby(wxr, item)
                form_text = clean_node(wxr, None, ruby_free_node)
            else:
                form_text = clean_node(wxr, None, item)
            raw_form_tags = extract_headword_tags(
                clean_node(wxr, None, pending_tag_nodes).strip("() ")
            )
            form_tags = []
            # Look one node ahead for a gender marker belonging to this form.
            if position < last_position:
                follower = stripped[position + 1]
                follower_is_gender = (
                    isinstance(follower, WikiNode)
                    and follower.kind == NodeKind.HTML
                    and follower.tag == "span"
                    and "gender" in follower.attrs.get("class", "")
                )
                if follower_is_gender:
                    gender = clean_node(wxr, None, follower)
                    if gender in TEMPLATE_TAG_ARGS:
                        form_tags.append(TEMPLATE_TAG_ARGS[gender])
                    else:
                        raw_form_tags.append(gender)

            new_form = Form(
                form=form_text,
                raw_tags=raw_form_tags,
                tags=form_tags,
                ruby=ruby_data,
            )
            translate_raw_tags(new_form)
            entry.forms.append(new_form)
        elif (
            item.tag == "span"
            and "tr" in item.attrs.get("class", "")
            and len(entry.forms) > 0
        ):
            # Romanization of the form added by the preceding <b> tag.
            entry.forms[-1].roman = clean_node(wxr, None, item)
        elif item.tag == "sup" and lang_code == "ja":
            extract_historical_kana(wxr, page_data, item)
        else:
            pending_tag_nodes.append(item)

    if found_form:
        return

    # No <b> element in this group: the text describes the entry itself.
    entry_tags = extract_headword_tags(
        clean_node(wxr, entry, pending_tag_nodes).strip("() ")
    )
    if len(entry_tags) > 0:
        entry.raw_tags.extend(entry_tags)
        translate_raw_tags(entry)



def extract_headword_tags(tags_str: str) -> list[str]:
    """Split a headword tag string into its individual tags.

    ``&``, ``或`` and ``和`` act as separators.  Whitespace around each
    piece is stripped and empty pieces are dropped, so an empty or
    separator-only string yields an empty list.
    """
    return [
        tag
        for tag in map(str.strip, re.split(r"&|或|和", tags_str))
        if tag
    ]



def extract_historical_kana(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    sup_node: HTMLNode,
) -> None:
    """Add the form found in a ``<sup>`` node to the current entry.

    Handles the output of the "hist" template parameter.  The form text is
    taken from the ``<strong>`` element and the romanization from the
    ``<span class="tr">`` element, with surrounding parentheses removed.
    When several elements match, the last one wins.  Nothing is added if
    no form text is found.
    """
    historical_form = ""
    romanization = ""
    for strong_tag in sup_node.find_html("strong"):
        historical_form = clean_node(wxr, None, strong_tag)
    for tr_span in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        romanization = clean_node(wxr, None, tr_span).strip("()")

    if not historical_form:
        return
    page_data[-1].forms.append(
        Form(form=historical_form, roman=romanization)
    )



def extract_tlb_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Collect raw tags from an expanded label template.

    The template is expanded and the cleaned, non-empty text of every
    ``<span class="ib-content">`` in the expansion is appended to
    ``page_data[-1].raw_tags``.  ``clean_node`` is then run on the whole
    expansion with the entry, and the entry's raw tags are translated.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    label_texts = (
        clean_node(wxr, None, label_span)
        for label_span in expanded_node.find_html_recursively(
            "span", attr_name="class", attr_value="ib-content"
        )
    )
    # The generator is consumed lazily, so each span is cleaned and
    # appended in turn.
    page_data[-1].raw_tags.extend(text for text in label_texts if text)
    clean_node(wxr, page_data[-1], expanded_node)
    translate_raw_tags(page_data[-1])