Coverage for src / wiktextract / extractor / zh / headword_line.py: 93%

141 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1import re 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..ruby import extract_ruby 

8from ..share import strip_nodes 

9from .models import Form, WordEntry 

10from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags 

11 

12 

def extract_pos_head_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Dispatch the nodes of a part-of-speech header line to extractors.

    "tlb"/"term-label" templates go to the term-label extractor, any other
    template is handled as a headword-line template, and only the first bold
    node of the line is treated as the headword itself.
    """
    seen_bold = False
    for item in nodes:
        if isinstance(item, TemplateNode):
            if item.template_name in ("tlb", "term-label"):
                extract_tlb_template(wxr, word_entry, item)
            else:
                extract_headword_line_template(wxr, word_entry, item)
            continue
        is_bold_node = isinstance(item, WikiNode) and item.kind == NodeKind.BOLD
        if is_bold_node and not seen_bold:
            process_headword_bold_node(wxr, word_entry, item)
            seen_bold = True
    translate_raw_tags(word_entry)

31 

32 

def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms and tags from a headword-line template.

    Only "head" and language-prefixed templates (e.g. "ja-noun") are
    processed; "*-see" redirect templates are skipped.  The template is
    expanded to HTML and the children of its "headword-line" span are
    dispatched by tag/class.  ``forms_start_index`` tracks where the
    inline form list begins so the remainder can be handed to
    extract_headword_forms().
    """
    # handle the first template in header line
    template_name = t_node.template_name
    if (
        template_name != "head"
        and not template_name.startswith(f"{word_entry.lang_code}-")
    ) or template_name.endswith("-see"):
        return

    # Expand the template so class attributes ("headword", "gender", ...)
    # become visible in the HTML tree.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Collect categories/links emitted by the expansion into word_entry.
    clean_node(wxr, word_entry, expanded_node)
    forms_start_index = 0
    nodes_after_span = []  # siblings outside the headword-line span
    for node in expanded_node.children:
        if not (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "headword-line" in node.attrs.get("class", "").split()
        ):
            # Not the headword-line span; save for a second forms pass below.
            nodes_after_span.append(node)
            continue
        for index, span_child in node.find_child(NodeKind.HTML, True):
            if span_child.tag == "span":
                forms_start_index = index + 1
                class_names = span_child.attrs.get("class", "").split()
                if "headword-tr" in class_names:
                    # Transliteration of the headword.
                    form = clean_node(wxr, word_entry, span_child)
                    if form != "":
                        word_entry.forms.append(
                            Form(form=form, tags=["romanization"])
                        )
                elif "gender" in class_names:
                    # Gender abbreviations; known ones map to tags,
                    # unknown ones are kept as raw tags.
                    for abbr_tag in span_child.find_html("abbr"):
                        gender = clean_node(wxr, None, abbr_tag)
                        if gender in TEMPLATE_TAG_ARGS:
                            word_entry.tags.append(TEMPLATE_TAG_ARGS[gender])
                        else:
                            word_entry.raw_tags.append(gender)
                elif "ib-content" in class_names:
                    # Inline label text becomes a raw tag.
                    raw_tag = clean_node(wxr, None, span_child)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
                else:
                    # Headword may be nested inside a plain span.
                    for strong_node in span_child.find_html(
                        "strong", attr_name="class", attr_value="headword"
                    ):
                        process_headword_bold_node(wxr, word_entry, strong_node)
            elif (
                span_child.tag == "strong"
                and "headword" in span_child.attrs.get("class", "")
            ):
                forms_start_index = index + 1
                process_headword_bold_node(wxr, word_entry, span_child)
            elif span_child.tag == "sup" and word_entry.lang_code == "ja":
                # Japanese historical kana spelling lives in a <sup>.
                extract_historical_kana(wxr, word_entry, span_child)
                forms_start_index = index + 1
            elif span_child.tag == "i":
                # Italic text holds raw tags; "^†" marks are stripped.
                for i_child in span_child.children:
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
                if len(span_child.children) > 0:
                    forms_start_index = index + 1
            elif span_child.tag == "b":
                # this is a form <b> tag, already inside form parentheses
                break

        # Everything after the last recognized child is the form list.
        extract_headword_forms(
            wxr, word_entry, node.children[forms_start_index:]
        )
    if len(nodes_after_span) > 0:
        extract_headword_forms(wxr, word_entry, nodes_after_span)

113 

114 

def process_headword_bold_node(
    wxr: WiktextractContext, word_entry: WordEntry, strong_node: HTMLNode
) -> None:
    """Extract the canonical written form (with ruby) from a bold headword.

    The form is recorded only when it carries ruby annotations or differs
    from the page word; otherwise it would merely duplicate ``word``.
    For "unsupported title" pages the form replaces the entry word instead
    of being stored as a form.
    """
    ruby_data, node_without_ruby = extract_ruby(wxr, strong_node)
    form = clean_node(wxr, word_entry, node_without_ruby)
    if (len(ruby_data) > 0 or form != word_entry.word) and len(form) > 0:
        if wxr.wtp.title.startswith("不支援的頁面名稱/"):
            # Unsupported titles:
            # https://zh.wiktionary.org/wiki/Appendix:不支援的頁面名稱
            # https://zh.wiktionary.org/wiki/Special:PrefixIndex/不支援的頁面名稱
            word_entry.word = form
            word_entry.original_title = wxr.wtp.title
        else:
            # Reuse the already-computed `form` instead of calling
            # clean_node() a second time with the same arguments, which
            # would repeat its side effects on word_entry.
            word_entry.forms.append(
                Form(
                    form=form,
                    ruby=ruby_data,
                    tags=["canonical"],
                )
            )

135 

136 

def extract_headword_forms(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Split the nodes following the headword at comma boundaries and
    process each comma-separated group as one form description."""
    group: list[WikiNode | str] = []
    for item in form_nodes:
        starts_new_group = isinstance(item, str) and item.startswith(
            ("，", ",")
        )
        if starts_new_group:
            process_forms_text(wxr, word_entry, group)
            # Start the next group with the text after the comma.
            group = [item[1:]]
        else:
            group.append(item)

    if group:
        process_forms_text(wxr, word_entry, group)

152 

153 

def process_forms_text(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Parse one comma-separated group of headword-line nodes.

    Bold nodes are forms (a single bold node may hold several forms
    separated by "/" or ","); a following span of class "gender" adds
    gender tags to those forms; a span of class "tr" is the romanization
    of the previous form.  Other nodes (except "mention-gloss-paren"
    spans) accumulate as tag text.  When the group contains no bold form
    at all, the accumulated tag text applies to the word entry itself.
    """
    tag_nodes = []
    has_forms = False
    striped_nodes = list(strip_nodes(form_nodes))
    for index, node in enumerate(striped_nodes):
        if (isinstance(node, HTMLNode) and node.tag == "b") or (
            isinstance(node, WikiNode) and node.kind == NodeKind.BOLD
        ):
            has_forms = True
            # Removed dead `ruby_data = []` — the tuple unpack below
            # assigns it unconditionally.
            ruby_data, node_without_ruby = extract_ruby(wxr, node)
            form = clean_node(wxr, None, node_without_ruby)
            # Tags seen so far (in surrounding parentheses) apply to this form.
            raw_form_tags = extract_headword_tags(
                clean_node(wxr, None, tag_nodes).strip("() ")
            )
            form_tags = []
            # check if next tag has gender data
            if index < len(striped_nodes) - 1:
                next_node = striped_nodes[index + 1]
                if (
                    isinstance(next_node, WikiNode)
                    and next_node.kind == NodeKind.HTML
                    and next_node.tag == "span"
                    and "gender" in next_node.attrs.get("class", "")
                ):
                    gender = clean_node(wxr, None, next_node)
                    if gender in TEMPLATE_TAG_ARGS:
                        form_tags.append(TEMPLATE_TAG_ARGS[gender])
                    else:
                        raw_form_tags.append(gender)

            for f_str in filter(None, map(str.strip, re.split(r"/|,", form))):
                form_data = Form(
                    form=f_str,
                    raw_tags=raw_form_tags,
                    tags=form_tags,
                    ruby=ruby_data,
                )
                translate_raw_tags(form_data)
                word_entry.forms.append(form_data)
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "tr" in node.attrs.get("class", "").split()
            and len(word_entry.forms) > 0
        ):
            # romanization of the previous form <b> tag
            word_entry.forms[-1].roman = clean_node(wxr, None, node)
        elif not (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "mention-gloss-paren" in node.attrs.get("class", "").split()
        ):
            tag_nodes.append(node)

    if not has_forms:
        # No forms in this group: the tag text describes the entry itself.
        tags_list = extract_headword_tags(
            clean_node(wxr, word_entry, tag_nodes).strip("() ")
        )
        if len(tags_list) > 0:
            word_entry.raw_tags.extend(tags_list)
            translate_raw_tags(word_entry)

220 

221 

def extract_headword_tags(tags_str: str) -> list[str]:
    """Split a cleaned tag string into individual tags.

    Tags may be joined by "&" or the Chinese conjunctions "或" (or) and
    "和" (and); surrounding whitespace is stripped and empty fragments
    are dropped.  Rewritten as a comprehension instead of a manual
    append loop.
    """
    return [t for t in (s.strip() for s in re.split("&|或|和", tags_str)) if t]

229 

230 

def extract_historical_kana(
    wxr: WiktextractContext, word_entry: WordEntry, sup_node: HTMLNode
) -> None:
    """Extract the historical kana spelling from a Japanese headword <sup>
    node and store it as an "archaic" form with its romanization.

    https://zh.wiktionary.org/wiki/Template:ja-adj
    "hist" parameter
    """
    kana = ""
    romanization = ""
    for bold_node in sup_node.find_html("strong"):
        kana = clean_node(wxr, None, bold_node)
    for tr_span in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        romanization = clean_node(wxr, None, tr_span).strip("()")
    if kana:
        word_entry.forms.append(
            Form(form=kana, roman=romanization, tags=["archaic"])
        )

246 

247 

def extract_tlb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract term labels as raw tags from a {{tlb}}/{{term-label}} template.

    https://zh.wiktionary.org/wiki/Template:Tlb
    https://en.wiktionary.org/wiki/Template:term-label
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for label_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        label_text = clean_node(wxr, None, label_span)
        for label in label_text.split(","):
            label = label.strip()
            if len(label) > 0:
                word_entry.raw_tags.append(label)
    # Pick up any categories emitted by the expansion.
    clean_node(wxr, word_entry, expanded)
    translate_raw_tags(word_entry)