Coverage for src/wiktextract/extractor/vi/pos.py: 58% (220 statements)
coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

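# Extractor for part-of-speech sections on the Vietnamese Wiktionary
# (vi.wiktionary.org): glosses, sense labels, form-of/alt-of links,
# headword-line forms, notes, and Chinese classifiers.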
import re

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .example import extract_example_list_item
from .models import AltForm, Classifier, Form, Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags

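# Build a WordEntry for one part-of-speech section: copy base_data, apply the
# POS name and tags from POS_DATA, read the numbered ("#") gloss lists, and
# pass templates that appear before the first gloss list to the headword
# handler.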
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    page_data[-1].pos = pos_data["pos"]
    base_data.pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    gloss_list_index = len(level_node.children)
    for index, list_node in level_node.find_child(NodeKind.LIST, True):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
                if index < gloss_list_index:
                    gloss_list_index = index

    for node in level_node.children[:gloss_list_index]:
        if isinstance(node, TemplateNode):
            extract_headword_template(wxr, page_data[-1], node)

# redirect
ALT_OF_TEMPLATES = frozenset(["altform", "alt form", "vi-alt sp", "vie-alt sp"])
FORM_OF_TEMPLATES = frozenset(["số nhiều của", "short for"])

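# Parse one "#" list item into a Sense: route label/term/form-of/@/zho-mw
# templates to their handlers, keep everything else as gloss text, then
# recurse into nested "#" sub-lists and collect "#:"/"#*" items as examples.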
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    sense = (
        parent_sense.model_copy(deep=True)
        if parent_sense is not None
        else Sense()
    )
    sense.examples.clear()
    gloss_nodes = []
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name in ["nhãn", "label", "def-lb", "context"]:
                extract_label_template(wxr, sense, node)
            elif node.template_name == "term":
                extract_term_template(wxr, sense, node)
            elif (
                node.template_name.endswith((" of", "-of"))
                or node.template_name in ALT_OF_TEMPLATES
                or node.template_name in FORM_OF_TEMPLATES
            ):
                extract_form_of_template(wxr, sense, node)
                gloss_nodes.append(node)
            elif node.template_name == "@":
                extract_at_template(wxr, sense, node)
            elif node.template_name in ["zho-mw", "zh-mw"]:
                extract_zh_mw_template(wxr, node, sense)
            else:
                gloss_nodes.append(node)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            gloss_nodes.append(node)
    gloss_str = clean_node(wxr, sense, gloss_nodes)
    if gloss_str != "":
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)
        elif child_list.sarg.startswith("#") and child_list.sarg.endswith(
            (":", "*")
        ):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, child_list_item
                )

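# {{nhãn}} ("label"): copy each comma-separated piece of the expanded
# "label-content" spans into sense.raw_tags, then run clean_node over the
# whole expansion so category links are recorded on the sense.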
def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Bản_mẫu:nhãn
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_classes = span_tag.attrs.get("class", "").split()
        if "label-content" in span_classes:
            for raw_tag in clean_node(wxr, None, span_tag).split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    sense.raw_tags.append(raw_tag)
    clean_node(wxr, sense, expanded_node)

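# {{term}}: the italic text of the expansion is treated as a raw tag on the
# sense.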
def extract_term_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Bản_mẫu:term
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for italic_node in expanded_node.find_child(NodeKind.ITALIC):
        raw_tag = clean_node(wxr, None, italic_node)
        if raw_tag != "":
            sense.raw_tags.append(raw_tag)

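# "... of"/alt-form templates: the first <i> in the expansion is the linked
# word and a "mention-tr" span its romanization; the result goes into
# sense.alt_of (tag "alt-of") for alternative-form templates, otherwise into
# sense.form_of (tag "form-of").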
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Thể_loại:Bản_mẫu_dạng_từ
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    form = AltForm(word="")
    for i_tag in expanded_node.find_html_recursively("i"):
        form.word = clean_node(wxr, None, i_tag)
        break
    for span_tag in expanded_node.find_html_recursively("span"):
        if "mention-tr" in span_tag.attrs.get("class", "").split():
            form.roman = clean_node(wxr, None, span_tag)
            break
    is_alt_of = (
        "alternative" in t_node.template_name
        or t_node.template_name in ALT_OF_TEMPLATES
    )
    if form.word != "":
        if is_alt_of:
            sense.alt_of.append(form)
            sense.tags.append("alt-of")
        else:
            sense.form_of.append(form)
            sense.tags.append("form-of")

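# {{@}} (obsolete): split the expanded italic text on commas and semicolons
# into raw tags.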
def extract_at_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Thể_loại:@
    # obsolete template
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for i_tag in expanded_node.find_html("i"):
        text = clean_node(wxr, None, i_tag)
        for raw_tag in re.split(r",|;", text):
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                sense.raw_tags.append(raw_tag)

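# Note sections: one note per list item, or the whole section body as a
# single note when the section has no list.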
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    has_list = False
    for list_node in level_node.find_child(NodeKind.LIST):
        has_list = True
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            note = clean_node(wxr, None, list_item.children)
            if note != "":
                word_entry.notes.append(note)
    if not has_list:
        note = clean_node(
            wxr,
            None,
            list(
                level_node.invert_find_child(
                    LEVEL_KIND_FLAGS, include_empty_str=True
                )
            ),
        )
        if note != "":
            word_entry.notes.append(note)

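# Headword-line templates: walk the expanded "headword-line" span and turn its
# <strong>/<span>/<i>/<b> children into Form objects (canonical form,
# romanization, gender, inline labels, bold alternative forms). Templates
# whose expansion has no headword-line span (the Template:eng-noun case) fall
# back to italic raw tags paired with "form-of" spans.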
def extract_headword_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    forms = []
    has_headword_span = False
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        has_headword_span = True
        i_tags = []
        for html_node in main_span_tag.find_child(NodeKind.HTML):
            class_names = html_node.attrs.get("class", "").split()
            if html_node.tag == "strong" and "headword" in class_names:
                ruby, no_ruby = extract_ruby(wxr, html_node)
                strong_str = clean_node(wxr, None, no_ruby)
                if strong_str not in ["", wxr.wtp.title] or len(ruby) > 0:
                    forms.append(
                        Form(form=strong_str, tags=["canonical"], ruby=ruby)
                    )
            elif html_node.tag == "span":
                if "headword-tr" in class_names or "tr" in class_names:
                    roman = clean_node(wxr, None, html_node)
                    if (
                        len(forms) > 0
                        and "canonical" not in forms[-1].tags
                        and "romanization" not in forms[-1].tags
                    ):
                        forms[-1].roman = roman
                    elif roman != "":
                        forms.append(Form(form=roman, tags=["romanization"]))
                elif "gender" in class_names:
                    for abbr_tag in html_node.find_html("abbr"):
                        gender_tag = abbr_tag.attrs.get(
                            "title", clean_node(wxr, None, abbr_tag)
                        )
                        if (
                            len(forms) > 0
                            and "canonical" not in forms[-1].tags
                            and "romanization" not in forms[-1].tags
                        ):
                            forms[-1].raw_tags.append(gender_tag)
                        else:
                            word_entry.raw_tags.append(gender_tag)
                elif "ib-content" in class_names:
                    raw_tag = clean_node(wxr, None, html_node)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
            elif html_node.tag == "sup" and word_entry.lang_code == "ja":
                forms.append(extract_historical_kana(wxr, html_node))
            elif html_node.tag == "i":
                if len(i_tags) > 0:
                    word_entry.raw_tags.extend(i_tags)
                    i_tags.clear()
                for i_child in html_node.children:
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":
                        i_tags.append(raw_tag)
            elif html_node.tag == "b":
                ruby, no_ruby = extract_ruby(wxr, html_node)
                for form_str in filter(
                    None,
                    map(str.strip, clean_node(wxr, None, no_ruby).split(",")),
                ):
                    form = Form(form=form_str, ruby=ruby)
                    if i_tags == ["hoặc"]:
                        if len(forms) > 0:
                            form.raw_tags.extend(forms[-1].raw_tags)
                    else:
                        form.raw_tags.extend(i_tags)
                    forms.append(form)
                i_tags.clear()

        if len(i_tags) > 0:
            word_entry.raw_tags.extend(i_tags)
    for form in forms:
        translate_raw_tags(form)
    word_entry.forms.extend(forms)
    clean_node(wxr, word_entry, expanded_node)
    translate_raw_tags(word_entry)

    if not has_headword_span:
        # Template:eng-noun
        raw_tag = ""
        for node in expanded_node.find_child_recursively(
            NodeKind.ITALIC | NodeKind.HTML
        ):
            if node.kind == NodeKind.ITALIC:
                raw_tag = clean_node(wxr, None, node)
            elif (
                isinstance(node, HTMLNode)
                and node.tag == "span"
                and "form-of" in node.attrs.get("class", "").split()
            ):
                form = Form(form=clean_node(wxr, None, node))
                if raw_tag != "":
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
                    raw_tag = ""
                if form.form != "":
                    word_entry.forms.append(form)

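# Japanese historical kana given in a <sup> tag: the <strong> text is the
# archaic form, a class="tr" span its romanization.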
def extract_historical_kana(
    wxr: WiktextractContext, sup_node: HTMLNode
) -> Form:
    form = Form(form="", tags=["archaic"])
    for strong_node in sup_node.find_html("strong"):
        form.form = clean_node(wxr, None, strong_node)
    for span_node in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        form.roman = clean_node(wxr, None, span_node)
    return form

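# {{zho-mw}}: Chinese measure words (classifiers). Spans with class
# "Hani"/"Hant"/"Hans" are classifier words, classifiers separated by "/" are
# kept in the same group, and span "title" attributes supply raw tags for the
# current group.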
def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
):
    # Chinese inline classifier template
    # https://zh.wiktionary.org/wiki/Bản_mẫu:zho-mw
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)