Coverage for src / wiktextract / extractor / zh / headword_line.py: 92%

147 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2025-12-29 01:50 +0000

1import re 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..ruby import extract_ruby 

8from ..share import strip_nodes 

9from .models import Classifier, Form, WordEntry 

10from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags 

11 

12 

def extract_pos_head_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Process the nodes that make up a part-of-speech header line.

    Template nodes are dispatched to either the tlb/term-label handler or
    the headword-line handler; the first bold node (if any) is treated as
    the headword.  Afterwards, forms carrying the raw tag "分類詞"
    ("classifier") are moved from ``word_entry.forms`` into
    ``word_entry.classifiers``.
    """
    seen_bold = False
    for node in nodes:
        if isinstance(node, TemplateNode):
            if node.template_name in ("tlb", "term-label"):
                extract_tlb_template(wxr, word_entry, node)
            else:
                extract_headword_line_template(wxr, word_entry, node)
        elif (
            not seen_bold
            and isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
        ):
            process_headword_bold_node(wxr, word_entry, node)
            seen_bold = True

    remaining_forms = []
    for form in word_entry.forms:
        if "分類詞" in form.raw_tags:
            # "分類詞" marks a classifier; store it separately.
            word_entry.classifiers.append(
                Classifier(
                    classifier=form.form, tags=form.tags, raw_tags=form.raw_tags
                )
            )
        else:
            remaining_forms.append(form)
    word_entry.forms = remaining_forms
    translate_raw_tags(word_entry)

42 

43 

def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    # handle the first template in header line
    #
    # Expands the headword template and walks the resulting
    # `<span class="headword-line">` HTML, extracting the headword,
    # romanizations, gender tags, label tags and inflected forms.
    # `forms_start_index` tracks where the forms section begins inside
    # the span's children so the tail can be handed to
    # extract_headword_forms().
    template_name = t_node.template_name
    # Only process "head" or language-specific ("<lang_code>-...")
    # templates; skip "-see" redirect-style templates entirely.
    if (
        template_name != "head"
        and not template_name.startswith(f"{word_entry.lang_code}-")
    ) or template_name.endswith("-see"):
        return

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Collect categories etc. from the whole expansion.
    clean_node(wxr, word_entry, expanded_node)
    forms_start_index = 0
    nodes_after_span = []
    for node in expanded_node.children:
        if not (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "headword-line" in node.attrs.get("class", "").split()
        ):
            # Anything outside the headword-line span is handled as
            # form text after the loop.
            nodes_after_span.append(node)
            continue
        for index, span_child in node.find_child(NodeKind.HTML, True):
            if span_child.tag == "span":
                forms_start_index = index + 1
                class_names = span_child.attrs.get("class", "").split()
                if "headword-tr" in class_names:
                    # Romanization of the headword.
                    form = clean_node(wxr, word_entry, span_child)
                    if form != "":
                        word_entry.forms.append(
                            Form(form=form, tags=["romanization"])
                        )
                elif "gender" in class_names:
                    # Gender abbreviations; map known ones to tags,
                    # keep unknown ones as raw tags.
                    for abbr_tag in span_child.find_html("abbr"):
                        gender = clean_node(wxr, None, abbr_tag)
                        if gender in TEMPLATE_TAG_ARGS:
                            word_entry.tags.append(TEMPLATE_TAG_ARGS[gender])
                        else:
                            word_entry.raw_tags.append(gender)
                elif "ib-content" in class_names:
                    # Label/qualifier text.
                    raw_tag = clean_node(wxr, None, span_child)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
                else:
                    # Other spans may still wrap the headword in a
                    # <strong class="headword"> tag.
                    for strong_node in span_child.find_html(
                        "strong", attr_name="class", attr_value="headword"
                    ):
                        process_headword_bold_node(wxr, word_entry, strong_node)
            elif (
                span_child.tag == "strong"
                and "headword" in span_child.attrs.get("class", "")
            ):
                forms_start_index = index + 1
                process_headword_bold_node(wxr, word_entry, span_child)
            elif span_child.tag == "sup" and word_entry.lang_code == "ja":
                # Japanese historical kana spelling ("hist" parameter).
                extract_historical_kana(wxr, word_entry, span_child)
                forms_start_index = index + 1
            elif span_child.tag == "i":
                # Italic text before the forms holds raw tags;
                # "^†" is a dagger marker prefix, not tag text.
                for i_child in span_child.children:
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
                if len(span_child.children) > 0:
                    forms_start_index = index + 1
            elif span_child.tag == "b":
                # this is a form <b> tag, already inside form parentheses
                break

        extract_headword_forms(
            wxr, word_entry, node.children[forms_start_index:]
        )
    if len(nodes_after_span) > 0:
        extract_headword_forms(wxr, word_entry, nodes_after_span)

124 

125 

def process_headword_bold_node(
    wxr: WiktextractContext, word_entry: WordEntry, strong_node: HTMLNode
) -> None:
    """Extract the headword from a bold/``<strong>`` node.

    Ruby annotations are split off first.  When the cleaned text differs
    from the page word (or ruby is present), it is recorded either as the
    real word — for "unsupported title" pages — or as a "canonical" form.
    """
    ruby_data, node_without_ruby = extract_ruby(wxr, strong_node)
    form = clean_node(wxr, word_entry, node_without_ruby)
    if (len(ruby_data) > 0 or form != word_entry.word) and len(form) > 0:
        if wxr.wtp.title.startswith("不支援的頁面名稱/"):
            # Unsupported titles:
            # https://zh.wiktionary.org/wiki/Appendix:不支援的頁面名稱
            # https://zh.wiktionary.org/wiki/Special:PrefixIndex/不支援的頁面名稱
            word_entry.word = form
            word_entry.original_title = wxr.wtp.title
        else:
            # Reuse the already-cleaned text instead of calling
            # clean_node() a second time — the duplicate call did the
            # same work and could duplicate category side effects on
            # word_entry.
            word_entry.forms.append(
                Form(
                    form=form,
                    ruby=ruby_data,
                    tags=["canonical"],
                )
            )

146 

147 

def extract_headword_forms(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Split the header-line tail on comma separators and process each
    comma-delimited group of nodes as one forms-text chunk.

    Both the ASCII comma and the fullwidth comma "," act as separators.
    """
    group: list[WikiNode | str] = []
    for item in form_nodes:
        starts_new_group = isinstance(item, str) and item.startswith((",", ","))
        if starts_new_group:
            process_forms_text(wxr, word_entry, group)
            # Keep the text after the separator as the start of the
            # next group.
            group = [item[1:]]
        else:
            group.append(item)

    if group:
        process_forms_text(wxr, word_entry, group)

163 

164 

def process_forms_text(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Extract form data from one comma-delimited chunk of a headword line.

    Bold / ``<b>`` nodes are forms; text collected before a form (usually
    parenthesized) supplies its raw tags; a following ``gender`` span adds
    gender tags; a ``tr`` span is the romanization of the previous form.
    If the chunk contains no form at all, the collected text becomes raw
    tags of the whole word entry.
    """
    tag_nodes = []
    has_forms = False
    striped_nodes = list(strip_nodes(form_nodes))
    for index, node in enumerate(striped_nodes):
        if (isinstance(node, HTMLNode) and node.tag == "b") or (
            isinstance(node, WikiNode) and node.kind == NodeKind.BOLD
        ):
            has_forms = True
            # Removed a dead `ruby_data = []` assignment that was
            # immediately overwritten by this unpack.
            ruby_data, node_without_ruby = extract_ruby(wxr, node)
            form = clean_node(wxr, None, node_without_ruby)
            raw_form_tags = extract_headword_tags(
                clean_node(wxr, None, tag_nodes).strip("() ")
            )
            form_tags = []
            # check if next tag has gender data
            if index < len(striped_nodes) - 1:
                next_node = striped_nodes[index + 1]
                if (
                    isinstance(next_node, WikiNode)
                    and next_node.kind == NodeKind.HTML
                    and next_node.tag == "span"
                    and "gender" in next_node.attrs.get("class", "")
                ):
                    gender = clean_node(wxr, None, next_node)
                    if gender in TEMPLATE_TAG_ARGS:
                        form_tags.append(TEMPLATE_TAG_ARGS[gender])
                    else:
                        raw_form_tags.append(gender)

            # One bold node may hold several forms separated by "/" or
            # a fullwidth comma.
            for f_str in filter(None, map(str.strip, re.split(r"/|,", form))):
                form_data = Form(
                    form=f_str,
                    raw_tags=raw_form_tags,
                    tags=form_tags,
                    ruby=ruby_data,
                )
                translate_raw_tags(form_data)
                word_entry.forms.append(form_data)
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "tr" in node.attrs.get("class", "").split()
            and len(word_entry.forms) > 0
        ):
            # romanization of the previous form <b> tag
            word_entry.forms[-1].roman = clean_node(wxr, None, node)
        elif not (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "mention-gloss-paren" in node.attrs.get("class", "").split()
        ):
            # Everything else (except the parenthesis spans) is tag text
            # for the next form.
            tag_nodes.append(node)

    if not has_forms:
        tags_list = extract_headword_tags(
            clean_node(wxr, word_entry, tag_nodes).strip("() ")
        )
        if len(tags_list) > 0:
            word_entry.raw_tags.extend(tags_list)
            translate_raw_tags(word_entry)

231 

232 

def extract_headword_tags(tags_str: str) -> list[str]:
    """Split a cleaned tag string into individual raw tags.

    Separators are "&", "或" ("or") and "和" ("and"); surrounding
    whitespace is stripped and empty pieces are dropped.
    """
    # Comprehension replaces the manual append loop (same behavior).
    stripped = (piece.strip() for piece in re.split("&|或|和", tags_str))
    return [piece for piece in stripped if piece]

240 

241 

def extract_historical_kana(
    wxr: WiktextractContext, word_entry: WordEntry, sup_node: HTMLNode
) -> None:
    # https://zh.wiktionary.org/wiki/Template:ja-adj
    # "hist" parameter
    #
    # The <sup> node holds the historical kana spelling in a <strong>
    # tag and its romanization in a <span class="tr"> tag; record them
    # as an "archaic" form.
    kana = ""
    romaji = ""
    for bold_tag in sup_node.find_html("strong"):
        kana = clean_node(wxr, None, bold_tag)
    for tr_span in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        romaji = clean_node(wxr, None, tr_span).strip("()")
    if kana:
        word_entry.forms.append(
            Form(form=kana, roman=romaji, tags=["archaic"])
        )

257 

258 

def extract_tlb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Tlb
    # https://en.wiktionary.org/wiki/Template:term-label
    #
    # Expand the label template and harvest each "ib-content" span as a
    # comma-separated list of raw tags for the whole word entry.
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        pieces = clean_node(wxr, None, span_tag).split(",")
        word_entry.raw_tags.extend(
            tag for tag in (piece.strip() for piece in pieces) if tag
        )
    # Pick up categories etc. from the full expansion.
    clean_node(wxr, word_entry, expanded)
    translate_raw_tags(word_entry)