Coverage for src/wiktextract/extractor/zh/headword_line.py: 88%

130 statements  

coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

  1  import re
  2
  3  from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
  4
  5  from ...page import clean_node
  6  from ...wxr_context import WiktextractContext
  7  from ..ruby import extract_ruby
  8  from ..share import strip_nodes
  9  from .models import Form, WordEntry
 10  from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
 11
 12
 13  def extract_pos_head_line_nodes(
 14      wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
 15  ) -> None:
 16      is_first_bold = True
 17      for node in nodes:
 18          if isinstance(node, TemplateNode):
 19              if node.template_name in ["tlb", "term-label"]:    [19 ↛ 20: the condition on line 19 was never true]
 20                  extract_tlb_template(wxr, word_entry, node)
 21              else:
 22                  extract_headword_line_template(wxr, word_entry, node)
 23          elif (    [23 ↛ 28: the condition on line 23 was never true]
 24              isinstance(node, WikiNode)
 25              and node.kind == NodeKind.BOLD
 26              and is_first_bold
 27          ):
 28              process_headword_bold_node(wxr, word_entry, node)
 29              is_first_bold = False
 30
 31
 32  def extract_headword_line_template(
 33      wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
 34  ) -> None:
 35      # handle the first template in header line
 36      template_name = t_node.template_name
 37      if (    [37 ↛ 41: the condition on line 37 was never true]
 38          template_name != "head"
 39          and not template_name.startswith(f"{word_entry.lang_code}-")
 40      ) or template_name.endswith("-see"):
 41          return
 42
 43      expanded_node = wxr.wtp.parse(
 44          wxr.wtp.node_to_wikitext(t_node), expand_all=True
 45      )
 46      clean_node(wxr, word_entry, expanded_node)
 47      forms_start_index = 0
 48      for span_node in expanded_node.find_html(
 49          "span", attr_name="class", attr_value="headword-line"
 50      ):
 51          for index, span_child in span_node.find_child(NodeKind.HTML, True):
 52              if span_child.tag == "span":
 53                  forms_start_index = index + 1
 54                  class_names = span_child.attrs.get("class", "")
 55                  if "headword-tr" in class_names:
 56                      form = clean_node(wxr, word_entry, span_child)
 57                      if form != "":    [57 ↛ 51: the condition on line 57 was always true]
 58                          word_entry.forms.append(
 59                              Form(form=form, tags=["romanization"])
 60                          )
 61                  elif "gender" in class_names:
 62                      for abbr_tag in span_child.find_html("abbr"):
 63                          gender = abbr_tag.children[0]
 64                          if gender in TEMPLATE_TAG_ARGS:    [64 ↛ 67: the condition on line 64 was always true]
 65                              word_entry.tags.append(TEMPLATE_TAG_ARGS[gender])
 66                          else:
 67                              word_entry.raw_tags.append(gender)
 68                              translate_raw_tags(word_entry)
 69                  else:
 70                      for strong_node in span_child.find_html(
 71                          "strong", attr_name="class", attr_value="headword"
 72                      ):
 73                          process_headword_bold_node(wxr, word_entry, strong_node)
 74              elif (
 75                  span_child.tag == "strong"
 76                  and "headword" in span_child.attrs.get("class", "")
 77              ):
 78                  forms_start_index = index + 1
 79                  process_headword_bold_node(wxr, word_entry, span_child)
 80              elif span_child.tag == "b":
 81                  # this is a form <b> tag, already inside form parentheses
 82                  break
 83
 84          extract_headword_forms(
 85              wxr, word_entry, span_node.children[forms_start_index:]
 86          )
 87
 88
 89  def process_headword_bold_node(
 90      wxr: WiktextractContext, word_entry: WordEntry, strong_node: HTMLNode
 91  ) -> None:
 92      ruby_data, node_without_ruby = extract_ruby(wxr, strong_node)
 93      form = clean_node(wxr, word_entry, node_without_ruby)
 94      if (len(ruby_data) > 0 or form != word_entry.word) and len(form) > 0:
 95          if wxr.wtp.title.startswith("不支援的頁面名稱/"):    [95 ↛ 99: the condition on line 95 was never true]
 96              # Unsupported titles:
 97              # https://zh.wiktionary.org/wiki/Appendix:不支援的頁面名稱
 98              # https://zh.wiktionary.org/wiki/Special:PrefixIndex/不支援的頁面名稱
 99              word_entry.word = form
100              word_entry.original_title = wxr.wtp.title
101          else:
102              word_entry.forms.append(
103                  Form(
104                      form=clean_node(wxr, word_entry, node_without_ruby),
105                      ruby=ruby_data,
106                      tags=["canonical"],
107                  )
108              )
109
110
111  def extract_headword_forms(
112      wxr: WiktextractContext,
113      word_entry: WordEntry,
114      form_nodes: list[WikiNode | str],
115  ) -> None:
116      current_nodes = []
117      for node in form_nodes:
118          if isinstance(node, str) and node.startswith((",", "，")):
119              process_forms_text(wxr, word_entry, current_nodes)
120              current_nodes = [node[1:]]
121          else:
122              current_nodes.append(node)
123
124      if len(current_nodes) > 0:
125          process_forms_text(wxr, word_entry, current_nodes)
126
127
128  def process_forms_text(
129      wxr: WiktextractContext,
130      word_entry: WordEntry,
131      form_nodes: list[WikiNode | str],
132  ) -> None:
133      tag_nodes = []
134      has_forms = False
135      striped_nodes = list(strip_nodes(form_nodes))
136      lang_code = word_entry.lang_code
137      for index, node in enumerate(striped_nodes):
138          if isinstance(node, WikiNode) and node.kind == NodeKind.HTML:
139              if node.tag == "b":
140                  has_forms = True
141                  ruby_data = []
142                  if lang_code == "ja":    [142 ↛ 143: the condition on line 142 was never true]
143                      ruby_data, node_without_ruby = extract_ruby(wxr, node)
144                      form = clean_node(wxr, None, node_without_ruby)
145                  else:
146                      form = clean_node(wxr, None, node)
147                  raw_form_tags = extract_headword_tags(
148                      clean_node(wxr, None, tag_nodes).strip("() ")
149                  )
150                  form_tags = []
151                  # check if next tag has gender data
152                  if index < len(striped_nodes) - 1:
153                      next_node = striped_nodes[index + 1]
154                      if (
155                          isinstance(next_node, WikiNode)
156                          and next_node.kind == NodeKind.HTML
157                          and next_node.tag == "span"
158                          and "gender" in next_node.attrs.get("class", "")
159                      ):
160                          gender = clean_node(wxr, None, next_node)
161                          if gender in TEMPLATE_TAG_ARGS:    [161 ↛ 164: the condition on line 161 was always true]
162                              form_tags.append(TEMPLATE_TAG_ARGS[gender])
163                          else:
164                              raw_form_tags.append(gender)
165
166                  for f_str in form.split("/"):
167                      f_str = f_str.strip()
168                      if f_str == "":    [168 ↛ 169: the condition on line 168 was never true]
169                          continue
170                      form_data = Form(
171                          form=f_str,
172                          raw_tags=raw_form_tags,
173                          tags=form_tags,
174                          ruby=ruby_data,
175                      )
176                      translate_raw_tags(form_data)
177                      word_entry.forms.append(form_data)
178              elif (    [178 ↛ 184: the condition on line 178 was never true]
179                  node.tag == "span"
180                  and "tr" in node.attrs.get("class", "")
181                  and len(word_entry.forms) > 0
182              ):
183                  # romanization of the previous form <b> tag
184                  word_entry.forms[-1].roman = clean_node(wxr, None, node)
185              elif node.tag == "sup" and lang_code == "ja":
186                  extract_historical_kana(wxr, word_entry, node)
187              else:
188                  tag_nodes.append(node)
189          else:
190              tag_nodes.append(node)
191
192      if not has_forms:
193          tags_list = extract_headword_tags(
194              clean_node(wxr, word_entry, tag_nodes).strip("() ")
195          )
196          if len(tags_list) > 0:
197              word_entry.raw_tags.extend(tags_list)
198              translate_raw_tags(word_entry)
199
200
201  def extract_headword_tags(tags_str: str) -> list[str]:
202      tags = []
203      for tag_str in filter(
204          None, (s.strip() for s in re.split("&|或|和", tags_str))
205      ):
206          tags.append(tag_str)
207      return tags
208
209
210  def extract_historical_kana(
211      wxr: WiktextractContext, word_entry: WordEntry, sup_node: HTMLNode
212  ) -> None:
213      # https://zh.wiktionary.org/wiki/Template:ja-adj
214      # "hist" parameter
215      form = ""
216      roman = ""
217      for strong_node in sup_node.find_html("strong"):
218          form = clean_node(wxr, None, strong_node)
219      for span_node in sup_node.find_html(
220          "span", attr_name="class", attr_value="tr"
221      ):
222          roman = clean_node(wxr, None, span_node).strip("()")
223      if len(form) > 0:    [223 ↛ exit: the condition on line 223 was always true]
224          form_data = Form(form=form, roman=roman)
225          word_entry.forms.append(form_data)
226
227
228  def extract_tlb_template(
229      wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
230  ) -> None:
231      # https://zh.wiktionary.org/wiki/Template:Tlb
232      # https://en.wiktionary.org/wiki/Template:term-label
233      expanded_node = wxr.wtp.parse(
234          wxr.wtp.node_to_wikitext(t_node), expand_all=True
235      )
236      for span_tag in expanded_node.find_html_recursively(
237          "span", attr_name="class", attr_value="ib-content"
238      ):
239          raw_tag = clean_node(wxr, None, span_tag)
240          if len(raw_tag) > 0:    [240 ↛ 236: the condition on line 240 was always true]
241              word_entry.raw_tags.append(raw_tag)
242      clean_node(wxr, word_entry, expanded_node)
243      translate_raw_tags(word_entry)