Coverage for src/wiktextract/extractor/vi/pos.py: 54%

157 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .example import extract_example_list_item 

15from .models import AltForm, Classifier, Form, Sense, WordEntry 

16from .section_titles import POS_DATA 

17from .tags import translate_raw_tags 

18 

19 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Open a new word entry for a part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data`` and receives
    the POS title, the POS value and any extra tags that ``POS_DATA`` lists
    for ``pos_title``.  ``base_data.pos`` is updated to the same POS value.

    Every item of a child list whose marker starts and ends with ``#`` is
    passed to ``extract_gloss_list_item``.  Template nodes located before the
    first such list (or anywhere in the section when there is none) are
    passed to ``extract_headword_template``.

    Raises ``KeyError`` if ``pos_title`` is not a key of ``POS_DATA``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Position of the earliest gloss list among the section's children;
    # defaults to "after the last child" when no gloss list is found.
    first_gloss_list = len(level_node.children)
    for position, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, item)
            first_gloss_list = min(first_gloss_list, position)

    headword_templates = [
        child
        for child in level_node.children[:first_gloss_list]
        if isinstance(child, TemplateNode)
    ]
    for t_node in headword_templates:
        extract_headword_template(wxr, entry, t_node)

45 

46 

# Template names that extract_gloss_list_item() hands to
# extract_form_of_template() in addition to names ending in " of" / "-of".
# Names in ALT_OF_TEMPLATES are recorded as "alt-of"; names in
# FORM_OF_TEMPLATES fall through to "form-of".
# NOTE(review): the previous comment here only said "redirect" - presumably
# these are redirect (alias) template names; confirm.
ALT_OF_TEMPLATES = frozenset(
    {"altform", "alt form", "vi-alt sp", "vie-alt sp"}
)
FORM_OF_TEMPLATES = frozenset({"số nhiều của", "short for"})

50 

51 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    """Build a sense from one gloss list item and recurse into nested lists.

    The new sense starts as a deep copy of ``parent_sense`` (with its
    examples removed) or as an empty ``Sense``.  Known templates among the
    item's children are dispatched to their dedicated extractors; the
    remaining non-list children form the gloss text.  The sense is added to
    ``word_entry.senses`` only when that gloss text is not empty.

    Nested lists are processed in either case: ``#...#`` lists as child
    glosses of this sense, ``#...:`` / ``#...*`` lists as its examples.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)
    sense.examples.clear()

    gloss_parts = []
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            # Nested lists are handled separately below; everything else
            # contributes to the gloss text.
            if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
                gloss_parts.append(child)
            continue

        name = child.template_name
        if name in ("nhãn", "label", "def-lb", "context"):
            extract_label_template(wxr, sense, child)
        elif name == "term":
            extract_term_template(wxr, sense, child)
        elif (
            name.endswith((" of", "-of"))
            or name in ALT_OF_TEMPLATES
            or name in FORM_OF_TEMPLATES
        ):
            # Form-of templates are both analysed and kept in the gloss.
            extract_form_of_template(wxr, sense, child)
            gloss_parts.append(child)
        elif name == "@":
            extract_at_template(wxr, sense, child)
        elif name in ("zho-mw", "zh-mw"):
            extract_zh_mw_template(wxr, child, sense)
        else:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for nested_list in list_item.find_child(NodeKind.LIST):
        marker = nested_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, nested_item, sense)
        elif marker.endswith((":", "*")):
            for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, nested_item
                )

103 

104 

def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Collect raw tags from an expanded label template.

    The template is expanded and every ``<span>`` whose class list contains
    ``label-content`` is cleaned to text; the comma-separated, non-empty
    parts of that text are appended to ``sense.raw_tags``.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:nhãn
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for span in expanded.find_html_recursively("span"):
        if "label-content" not in span.attrs.get("class", "").split():
            continue
        label_text = clean_node(wxr, None, span)
        stripped_parts = (part.strip() for part in label_text.split(","))
        sense.raw_tags.extend(tag for tag in stripped_parts if tag != "")
    # The returned text is discarded; this call passes `sense` as the
    # collection target - presumably to gather category links, confirm
    # against clean_node().
    clean_node(wxr, sense, expanded)

120 

121 

def extract_term_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Add the italic text of an expanded "term" template as raw tags.

    Only italic nodes that are direct children of the expansion are used;
    each one with non-empty cleaned text adds one entry to
    ``sense.raw_tags``.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:term
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    italic_texts = (
        clean_node(wxr, None, italic)
        for italic in expanded.find_child(NodeKind.ITALIC)
    )
    for text in italic_texts:
        if text == "":
            continue
        sense.raw_tags.append(text)

133 

134 

def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Record the word a form-of / alt-of template points to.

    The word is the cleaned text of the first ``<i>`` element in the
    expanded template; the romanization, if any, is the cleaned text of the
    first ``<span>`` whose class list contains ``mention-tr``.  Nothing is
    recorded when no word is found.

    The form goes to ``sense.alt_of`` (tag ``alt-of``) when the template
    name contains "alternative" or is in ``ALT_OF_TEMPLATES``, and to
    ``sense.form_of`` (tag ``form-of``) otherwise.
    """
    # https://vi.wiktionary.org/wiki/Thể_loại:Bản_mẫu_dạng_từ
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    form = AltForm(word="")

    first_italic = next(iter(expanded.find_html_recursively("i")), None)
    if first_italic is not None:
        form.word = clean_node(wxr, None, first_italic)

    roman_span = next(
        (
            span
            for span in expanded.find_html_recursively("span")
            if "mention-tr" in span.attrs.get("class", "").split()
        ),
        None,
    )
    if roman_span is not None:
        form.roman = clean_node(wxr, None, roman_span)

    if form.word == "":
        return

    name = t_node.template_name
    if "alternative" in name or name in ALT_OF_TEMPLATES:
        sense.alt_of.append(form)
        sense.tags.append("alt-of")
    else:
        sense.form_of.append(form)
        sense.tags.append("form-of")

161 

162 

def extract_at_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Collect raw tags from the italic text of an expanded "@" template.

    The cleaned text of each ``<i>`` element returned by ``find_html`` is
    split on commas and semicolons; every non-empty, stripped piece is
    appended to ``sense.raw_tags``.
    """
    # https://vi.wiktionary.org/wiki/Thể_loại:@
    # obsolete template
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for italic in expanded.find_html("i"):
        pieces = re.split(r",|;", clean_node(wxr, None, italic))
        for piece in pieces:
            tag = piece.strip()
            if tag == "":
                continue
            sense.raw_tags.append(tag)

177 

178 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Store the notes of a section in ``word_entry.notes``.

    When the section has child lists, each list item becomes one note.
    Otherwise the section's children that do not match ``LEVEL_KIND_FLAGS``
    are cleaned together into a single note.  Empty notes are dropped.
    """
    list_nodes = list(level_node.find_child(NodeKind.LIST))
    if list_nodes:
        note_sources = [
            item.children
            for list_node in list_nodes
            for item in list_node.find_child(NodeKind.LIST_ITEM)
        ]
    else:
        note_sources = [
            list(
                level_node.invert_find_child(
                    LEVEL_KIND_FLAGS, include_empty_str=True
                )
            )
        ]

    for nodes in note_sources:
        text = clean_node(wxr, None, nodes)
        if text != "":
            word_entry.notes.append(text)

201 

202 

def extract_headword_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract inflected forms from an expanded headword template.

    The expansion is walked for italic nodes and HTML nodes.  The text of an
    italic node is remembered as the pending raw tag.  Each ``<span>`` whose
    class list contains ``form-of`` yields a ``Form``; a pending raw tag is
    attached to it, translated, and then reset so it is used only once.
    Forms with empty text are not stored.
    """
    pending_tag = ""
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for found in expanded.find_child_recursively(
        NodeKind.ITALIC | NodeKind.HTML
    ):
        if found.kind == NodeKind.ITALIC:
            pending_tag = clean_node(wxr, None, found)
            continue

        is_form_span = (
            isinstance(found, HTMLNode)
            and found.tag == "span"
            and "form-of" in found.attrs.get("class", "").split()
        )
        if not is_form_span:
            continue

        form = Form(form=clean_node(wxr, None, found))
        if pending_tag != "":
            form.raw_tags.append(pending_tag)
            translate_raw_tags(form)
            pending_tag = ""
        if form.form != "":
            word_entry.forms.append(form)

    # Top-level links are cleaned with `word_entry` as the collection
    # target; the returned text is unused - presumably this gathers
    # category links, confirm against clean_node().
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)

230 

231 

def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
):
    """Extract classifiers from an expanded Chinese inline classifier template.

    Spans whose ``class`` attribute is exactly ``Hani``, ``Hant`` or
    ``Hans`` carry classifier text; a span with text "/" is a separator and
    keeps the classifiers on both sides in the same pending group.  Other
    spans with a ``title`` attribute add that title as a raw tag to every
    classifier in the pending group.  Groups are flushed into
    ``sense.classifiers``, and finally every classifier of the sense has
    its raw tags translated.
    """
    # NOTE(review): the previous comment linked to
    # https://zh.wiktionary.org/wiki/Bản_mẫu:zho-mw - the Vietnamese
    # "Bản_mẫu" namespace suggests vi.wiktionary.org was meant; confirm.
    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    pending = []
    previous_text = ""
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        if css_class in ("Hani", "Hant", "Hans"):
            text = clean_node(wxr, None, span)
            if text != "/":
                classifier = Classifier(classifier=text)
                if css_class in script_tags:
                    classifier.tags.append(script_tags[css_class])

                # A new classifier not joined by "/" starts a new group, so
                # the previous group is complete and can be stored.
                if pending and previous_text != "/":
                    sense.classifiers.extend(pending)
                    pending.clear()
                pending.append(classifier)
            previous_text = text
        elif "title" in span.attrs:
            title_text = clean_node(wxr, None, span.attrs["title"])
            if title_text:
                for classifier in pending:
                    classifier.raw_tags.append(title_text)

    sense.classifiers.extend(pending)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)