Coverage for src/wiktextract/extractor/zh/headword_line.py: 90%

136 statements  

coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

  1  import re
  2
  3  from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
  4
  5  from ...page import clean_node
  6  from ...wxr_context import WiktextractContext
  7  from ..ruby import extract_ruby
  8  from ..share import strip_nodes
  9  from .models import Form, WordEntry
 10  from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
 11
 12
 13  def extract_pos_head_line_nodes(
 14      wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
 15  ) -> None:
 16      is_first_bold = True
 17      for node in nodes:
 18          if isinstance(node, TemplateNode):
 19              if node.template_name in ["tlb", "term-label"]:
 20                  extract_tlb_template(wxr, word_entry, node)
 21              else:
 22                  extract_headword_line_template(wxr, word_entry, node)
 23          elif (  [23 ↛ 28: line 23 didn't jump to line 28 because the condition on line 23 was never true]
 24              isinstance(node, WikiNode)
 25              and node.kind == NodeKind.BOLD
 26              and is_first_bold
 27          ):
 28              process_headword_bold_node(wxr, word_entry, node)
 29              is_first_bold = False
 30
 31
 32  def extract_headword_line_template(
 33      wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
 34  ) -> None:
 35      # handle the first template in header line
 36      template_name = t_node.template_name
 37      if (  [37 ↛ 41: line 37 didn't jump to line 41 because the condition on line 37 was never true]
 38          template_name != "head"
 39          and not template_name.startswith(f"{word_entry.lang_code}-")
 40      ) or template_name.endswith("-see"):
 41          return
 42
 43      expanded_node = wxr.wtp.parse(
 44          wxr.wtp.node_to_wikitext(t_node), expand_all=True
 45      )
 46      clean_node(wxr, word_entry, expanded_node)
 47      forms_start_index = 0
 48      for span_node in expanded_node.find_html(
 49          "span", attr_name="class", attr_value="headword-line"
 50      ):
 51          for index, span_child in span_node.find_child(NodeKind.HTML, True):
 52              if span_child.tag == "span":
 53                  forms_start_index = index + 1
 54                  class_names = span_child.attrs.get("class", "").split()
 55                  if "headword-tr" in class_names:
 56                      form = clean_node(wxr, word_entry, span_child)
 57                      if form != "":  [57 ↛ 51: line 57 didn't jump to line 51 because the condition on line 57 was always true]
 58                          word_entry.forms.append(
 59                              Form(form=form, tags=["romanization"])
 60                          )
 61                  elif "gender" in class_names:
 62                      for abbr_tag in span_child.find_html("abbr"):
 63                          gender = clean_node(wxr, None, abbr_tag)
 64                          if gender in TEMPLATE_TAG_ARGS:  [64 ↛ 67: line 64 didn't jump to line 67 because the condition on line 64 was always true]
 65                              word_entry.tags.append(TEMPLATE_TAG_ARGS[gender])
 66                          else:
 67                              word_entry.raw_tags.append(gender)
 68                              translate_raw_tags(word_entry)
 69                  elif "ib-content" in class_names:
 70                      raw_tag = clean_node(wxr, None, span_child)
 71                      if raw_tag != "":  [71 ↛ 51: line 71 didn't jump to line 51 because the condition on line 71 was always true]
 72                          word_entry.raw_tags.append(raw_tag)
 73                          translate_raw_tags(word_entry)
 74                  else:
 75                      for strong_node in span_child.find_html(
 76                          "strong", attr_name="class", attr_value="headword"
 77                      ):
 78                          process_headword_bold_node(wxr, word_entry, strong_node)
 79              elif (
 80                  span_child.tag == "strong"
 81                  and "headword" in span_child.attrs.get("class", "")
 82              ):
 83                  forms_start_index = index + 1
 84                  process_headword_bold_node(wxr, word_entry, span_child)
 85              elif span_child.tag == "b":
 86                  # this is a form <b> tag, already inside form parentheses
 87                  break
 88
 89          extract_headword_forms(
 90              wxr, word_entry, span_node.children[forms_start_index:]
 91          )
 92
 93
 94  def process_headword_bold_node(
 95      wxr: WiktextractContext, word_entry: WordEntry, strong_node: HTMLNode
 96  ) -> None:
 97      ruby_data, node_without_ruby = extract_ruby(wxr, strong_node)
 98      form = clean_node(wxr, word_entry, node_without_ruby)
 99      if (len(ruby_data) > 0 or form != word_entry.word) and len(form) > 0:
100          if wxr.wtp.title.startswith("不支援的頁面名稱/"):  [100 ↛ 104: line 100 didn't jump to line 104 because the condition on line 100 was never true]
101              # Unsupported titles:
102              # https://zh.wiktionary.org/wiki/Appendix:不支援的頁面名稱
103              # https://zh.wiktionary.org/wiki/Special:PrefixIndex/不支援的頁面名稱
104              word_entry.word = form
105              word_entry.original_title = wxr.wtp.title
106          else:
107              word_entry.forms.append(
108                  Form(
109                      form=clean_node(wxr, word_entry, node_without_ruby),
110                      ruby=ruby_data,
111                      tags=["canonical"],
112                  )
113              )
114
115
116  def extract_headword_forms(
117      wxr: WiktextractContext,
118      word_entry: WordEntry,
119      form_nodes: list[WikiNode | str],
120  ) -> None:
121      current_nodes = []
122      for node in form_nodes:
123          if isinstance(node, str) and node.startswith((",", ",")):
124              process_forms_text(wxr, word_entry, current_nodes)
125              current_nodes = [node[1:]]
126          else:
127              current_nodes.append(node)
128
129      if len(current_nodes) > 0:
130          process_forms_text(wxr, word_entry, current_nodes)
131
132
133  def process_forms_text(
134      wxr: WiktextractContext,
135      word_entry: WordEntry,
136      form_nodes: list[WikiNode | str],
137  ) -> None:
138      tag_nodes = []
139      has_forms = False
140      striped_nodes = list(strip_nodes(form_nodes))
141      lang_code = word_entry.lang_code
142      for index, node in enumerate(striped_nodes):
143          if isinstance(node, WikiNode) and node.kind == NodeKind.HTML:
144              if node.tag == "b":
145                  has_forms = True
146                  ruby_data = []
147                  if lang_code == "ja":  [147 ↛ 148: line 147 didn't jump to line 148 because the condition on line 147 was never true]
148                      ruby_data, node_without_ruby = extract_ruby(wxr, node)
149                      form = clean_node(wxr, None, node_without_ruby)
150                  else:
151                      form = clean_node(wxr, None, node)
152                  raw_form_tags = extract_headword_tags(
153                      clean_node(wxr, None, tag_nodes).strip("() ")
154                  )
155                  form_tags = []
156                  # check if next tag has gender data
157                  if index < len(striped_nodes) - 1:
158                      next_node = striped_nodes[index + 1]
159                      if (
160                          isinstance(next_node, WikiNode)
161                          and next_node.kind == NodeKind.HTML
162                          and next_node.tag == "span"
163                          and "gender" in next_node.attrs.get("class", "")
164                      ):
165                          gender = clean_node(wxr, None, next_node)
166                          if gender in TEMPLATE_TAG_ARGS:  [166 ↛ 169: line 166 didn't jump to line 169 because the condition on line 166 was always true]
167                              form_tags.append(TEMPLATE_TAG_ARGS[gender])
168                          else:
169                              raw_form_tags.append(gender)
170
171                  for f_str in form.split("/"):
172                      f_str = f_str.strip()
173                      if f_str == "":  [173 ↛ 174: line 173 didn't jump to line 174 because the condition on line 173 was never true]
174                          continue
175                      form_data = Form(
176                          form=f_str,
177                          raw_tags=raw_form_tags,
178                          tags=form_tags,
179                          ruby=ruby_data,
180                      )
181                      translate_raw_tags(form_data)
182                      word_entry.forms.append(form_data)
183              elif (
184                  node.tag == "span"
185                  and "tr" in node.attrs.get("class", "")
186                  and len(word_entry.forms) > 0
187              ):
188                  # romanization of the previous form <b> tag
189                  word_entry.forms[-1].roman = clean_node(wxr, None, node)
190              elif node.tag == "sup" and lang_code == "ja":
191                  extract_historical_kana(wxr, word_entry, node)
192              else:
193                  tag_nodes.append(node)
194          else:
195              tag_nodes.append(node)
196
197      if not has_forms:
198          tags_list = extract_headword_tags(
199              clean_node(wxr, word_entry, tag_nodes).strip("() ")
200          )
201          if len(tags_list) > 0:
202              word_entry.raw_tags.extend(tags_list)
203              translate_raw_tags(word_entry)
204
205
206  def extract_headword_tags(tags_str: str) -> list[str]:
207      tags = []
208      for tag_str in filter(
209          None, (s.strip() for s in re.split("&|或|和", tags_str))
210      ):
211          tags.append(tag_str)
212      return tags
213
214
215  def extract_historical_kana(
216      wxr: WiktextractContext, word_entry: WordEntry, sup_node: HTMLNode
217  ) -> None:
218      # https://zh.wiktionary.org/wiki/Template:ja-adj
219      # "hist" parameter
220      form = ""
221      roman = ""
222      for strong_node in sup_node.find_html("strong"):
223          form = clean_node(wxr, None, strong_node)
224      for span_node in sup_node.find_html(
225          "span", attr_name="class", attr_value="tr"
226      ):
227          roman = clean_node(wxr, None, span_node).strip("()")
228      if len(form) > 0:  [228 ↛ exit: line 228 didn't return from function 'extract_historical_kana' because the condition on line 228 was always true]
229          form_data = Form(form=form, roman=roman)
230          word_entry.forms.append(form_data)
231
232
233  def extract_tlb_template(
234      wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
235  ) -> None:
236      # https://zh.wiktionary.org/wiki/Template:Tlb
237      # https://en.wiktionary.org/wiki/Template:term-label
238      expanded_node = wxr.wtp.parse(
239          wxr.wtp.node_to_wikitext(t_node), expand_all=True
240      )
241      for span_tag in expanded_node.find_html_recursively(
242          "span", attr_name="class", attr_value="ib-content"
243      ):
244          for raw_tag in clean_node(wxr, None, span_tag).split(","):
245              raw_tag = raw_tag.strip()
246              if len(raw_tag) > 0:  [246 ↛ 244: line 246 didn't jump to line 244 because the condition on line 246 was always true]
247                  word_entry.raw_tags.append(raw_tag)
248      clean_node(wxr, word_entry, expanded_node)
249      translate_raw_tags(word_entry)
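
For reference, extract_headword_tags (line 206) is a pure string helper: it splits a raw tag string on "&" and on the Chinese conjunctions 或 ("or") and 和 ("and"), strips surrounding whitespace, and drops empty pieces. A minimal usage sketch, assuming the package is importable as wiktextract.extractor.zh.headword_line; the sample tag strings are illustrative, not taken from the report:

    from wiktextract.extractor.zh.headword_line import extract_headword_tags

    # Splitting on the conjunction 和 ("and"); whitespace around each piece is stripped.
    print(extract_headword_tags("陽性 和 陰性"))  # ['陽性', '陰性']
    # Splitting on 或 ("or").
    print(extract_headword_tags("不可數或可數"))  # ['不可數', '可數']
    # Empty input yields no tags.
    print(extract_headword_tags(""))  # []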