Coverage for src / wiktextract / extractor / zh / headword_line.py: 93%

141 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1import re 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..ruby import extract_ruby 

8from ..share import strip_nodes 

9from .models import Form, WordEntry 

10from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags 

11 

12 

def extract_pos_head_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Dispatch the nodes of a part-of-speech header line to extractors.

    "tlb"/"term-label" templates go to the term-label extractor, any other
    template is handled as a headword-line template, and only the first bold
    node of the line is treated as the headword itself.
    """
    seen_bold = False
    for item in nodes:
        if isinstance(item, TemplateNode):
            if item.template_name in ("tlb", "term-label"):
                extract_tlb_template(wxr, word_entry, item)
            else:
                extract_headword_line_template(wxr, word_entry, item)
            continue
        is_bold_node = isinstance(item, WikiNode) and item.kind == NodeKind.BOLD
        if is_bold_node and not seen_bold:
            process_headword_bold_node(wxr, word_entry, item)
            seen_bold = True
    translate_raw_tags(word_entry)

31 

32 

def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms and tags from a headword-line template.

    Only "head" and language-prefixed templates (e.g. "ja-noun") are
    processed; "*-see" redirect templates are skipped.  The template is
    expanded to HTML and the children of its "headword-line" span are
    dispatched by tag/class.  ``forms_start_index`` tracks where the
    inline form list begins so the remainder can be handed to
    extract_headword_forms().
    """
    # handle the first template in header line
    template_name = t_node.template_name
    if (
        template_name != "head"
        and not template_name.startswith(f"{word_entry.lang_code}-")
    ) or template_name.endswith("-see"):
        return

    # Expand the template so class attributes ("headword", "gender", ...)
    # become visible in the HTML tree.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Collect categories/links emitted by the expansion into word_entry.
    clean_node(wxr, word_entry, expanded_node)
    forms_start_index = 0
    nodes_after_span = []  # siblings outside the headword-line span
    for node in expanded_node.children:
        if not (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "headword-line" in node.attrs.get("class", "").split()
        ):
            # Not the headword-line span; save for a second forms pass below.
            nodes_after_span.append(node)
            continue
        for index, span_child in node.find_child(NodeKind.HTML, True):
            if span_child.tag == "span":
                forms_start_index = index + 1
                class_names = span_child.attrs.get("class", "").split()
                if "headword-tr" in class_names:
                    # Transliteration of the headword.
                    form = clean_node(wxr, word_entry, span_child)
                    if form != "":
                        word_entry.forms.append(
                            Form(form=form, tags=["romanization"])
                        )
                elif "gender" in class_names:
                    # Gender abbreviations; known ones map to tags,
                    # unknown ones are kept as raw tags.
                    for abbr_tag in span_child.find_html("abbr"):
                        gender = clean_node(wxr, None, abbr_tag)
                        if gender in TEMPLATE_TAG_ARGS:
                            word_entry.tags.append(TEMPLATE_TAG_ARGS[gender])
                        else:
                            word_entry.raw_tags.append(gender)
                elif "ib-content" in class_names:
                    # Inline label text becomes a raw tag.
                    raw_tag = clean_node(wxr, None, span_child)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
                else:
                    # Headword may be nested inside a plain span.
                    for strong_node in span_child.find_html(
                        "strong", attr_name="class", attr_value="headword"
                    ):
                        process_headword_bold_node(wxr, word_entry, strong_node)
            elif (
                span_child.tag == "strong"
                and "headword" in span_child.attrs.get("class", "")
            ):
                forms_start_index = index + 1
                process_headword_bold_node(wxr, word_entry, span_child)
            elif span_child.tag == "sup" and word_entry.lang_code == "ja":
                # Japanese historical kana spelling lives in a <sup>.
                extract_historical_kana(wxr, word_entry, span_child)
                forms_start_index = index + 1
            elif span_child.tag == "i":
                # Italic text holds raw tags; "^†" marks are stripped.
                for i_child in span_child.children:
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
                if len(span_child.children) > 0:
                    forms_start_index = index + 1
            elif span_child.tag == "b":
                # this is a form <b> tag, already inside form parentheses
                break

        # Everything after the last recognized child is the form list.
        extract_headword_forms(
            wxr, word_entry, node.children[forms_start_index:]
        )
    if len(nodes_after_span) > 0:
        extract_headword_forms(wxr, word_entry, nodes_after_span)

113 

114 

def process_headword_bold_node(
    wxr: WiktextractContext, word_entry: WordEntry, strong_node: HTMLNode
) -> None:
    """Extract the canonical written form (with ruby) from a bold headword.

    The form is recorded only when it carries ruby annotations or differs
    from the page word; otherwise it would merely duplicate ``word``.
    For "unsupported title" pages the form replaces the entry word instead
    of being stored as a form.
    """
    ruby_data, node_without_ruby = extract_ruby(wxr, strong_node)
    form = clean_node(wxr, word_entry, node_without_ruby)
    if (len(ruby_data) > 0 or form != word_entry.word) and len(form) > 0:
        if wxr.wtp.title.startswith("不支援的頁面名稱/"):
            # Unsupported titles:
            # https://zh.wiktionary.org/wiki/Appendix:不支援的頁面名稱
            # https://zh.wiktionary.org/wiki/Special:PrefixIndex/不支援的頁面名稱
            word_entry.word = form
            word_entry.original_title = wxr.wtp.title
        else:
            # Reuse the already-computed `form` instead of calling
            # clean_node() a second time with the same arguments, which
            # would repeat its side effects on word_entry.
            word_entry.forms.append(
                Form(
                    form=form,
                    ruby=ruby_data,
                    tags=["canonical"],
                )
            )

135 

136 

def extract_headword_forms(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Split the nodes following the headword at comma boundaries and
    process each comma-separated group as one form description."""
    group: list[WikiNode | str] = []
    for item in form_nodes:
        starts_new_group = isinstance(item, str) and item.startswith(
            ("，", ",")
        )
        if starts_new_group:
            process_forms_text(wxr, word_entry, group)
            # Start the next group with the text after the comma.
            group = [item[1:]]
        else:
            group.append(item)

    if group:
        process_forms_text(wxr, word_entry, group)

152 

153 

def process_forms_text(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Parse one comma-separated group of headword-line nodes.

    Bold nodes are forms (a single bold node may hold several forms
    separated by "/" or ","); a following span of class "gender" adds
    gender tags to those forms; a span of class "tr" is the romanization
    of the previous form.  Other nodes (except "mention-gloss-paren"
    spans) accumulate as tag text.  When the group contains no bold form
    at all, the accumulated tag text applies to the word entry itself.
    """
    tag_nodes = []
    has_forms = False
    striped_nodes = list(strip_nodes(form_nodes))
    for index, node in enumerate(striped_nodes):
        if (isinstance(node, HTMLNode) and node.tag == "b") or (
            isinstance(node, WikiNode) and node.kind == NodeKind.BOLD
        ):
            has_forms = True
            # Removed dead `ruby_data = []` — the tuple unpack below
            # assigns it unconditionally.
            ruby_data, node_without_ruby = extract_ruby(wxr, node)
            form = clean_node(wxr, None, node_without_ruby)
            # Tags seen so far (in surrounding parentheses) apply to this form.
            raw_form_tags = extract_headword_tags(
                clean_node(wxr, None, tag_nodes).strip("() ")
            )
            form_tags = []
            # check if next tag has gender data
            if index < len(striped_nodes) - 1:
                next_node = striped_nodes[index + 1]
                if (
                    isinstance(next_node, WikiNode)
                    and next_node.kind == NodeKind.HTML
                    and next_node.tag == "span"
                    and "gender" in next_node.attrs.get("class", "")
                ):
                    gender = clean_node(wxr, None, next_node)
                    if gender in TEMPLATE_TAG_ARGS:
                        form_tags.append(TEMPLATE_TAG_ARGS[gender])
                    else:
                        raw_form_tags.append(gender)

            for f_str in filter(None, map(str.strip, re.split(r"/|,", form))):
                form_data = Form(
                    form=f_str,
                    raw_tags=raw_form_tags,
                    tags=form_tags,
                    ruby=ruby_data,
                )
                translate_raw_tags(form_data)
                word_entry.forms.append(form_data)
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "tr" in node.attrs.get("class", "").split()
            and len(word_entry.forms) > 0
        ):
            # romanization of the previous form <b> tag
            word_entry.forms[-1].roman = clean_node(wxr, None, node)
        elif not (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "mention-gloss-paren" in node.attrs.get("class", "").split()
        ):
            tag_nodes.append(node)

    if not has_forms:
        # No forms in this group: the tag text describes the entry itself.
        tags_list = extract_headword_tags(
            clean_node(wxr, word_entry, tag_nodes).strip("() ")
        )
        if len(tags_list) > 0:
            word_entry.raw_tags.extend(tags_list)
            translate_raw_tags(word_entry)

220 

221 

def extract_headword_tags(tags_str: str) -> list[str]:
    """Split a cleaned tag string into individual tags.

    Tags may be joined by "&" or the Chinese conjunctions "或" (or) and
    "和" (and); surrounding whitespace is stripped and empty fragments
    are dropped.  Rewritten as a comprehension instead of a manual
    append loop.
    """
    return [t for t in (s.strip() for s in re.split("&|或|和", tags_str)) if t]

229 

230 

def extract_historical_kana(
    wxr: WiktextractContext, word_entry: WordEntry, sup_node: HTMLNode
) -> None:
    """Extract the historical kana spelling from a Japanese headword <sup>
    node and store it as an "archaic" form with its romanization.

    https://zh.wiktionary.org/wiki/Template:ja-adj
    "hist" parameter
    """
    kana = ""
    romanization = ""
    for bold_node in sup_node.find_html("strong"):
        kana = clean_node(wxr, None, bold_node)
    for tr_span in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        romanization = clean_node(wxr, None, tr_span).strip("()")
    if kana:
        word_entry.forms.append(
            Form(form=kana, roman=romanization, tags=["archaic"])
        )

246 

247 

def extract_tlb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract term labels as raw tags from a {{tlb}}/{{term-label}} template.

    https://zh.wiktionary.org/wiki/Template:Tlb
    https://en.wiktionary.org/wiki/Template:term-label
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for label_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        label_text = clean_node(wxr, None, label_span)
        for label in label_text.split(","):
            label = label.strip()
            if len(label) > 0:
                word_entry.raw_tags.append(label)
    # Pick up any categories emitted by the expansion.
    clean_node(wxr, word_entry, expanded)
    translate_raw_tags(word_entry)