Coverage for src/wiktextract/extractor/vi/pos.py: 58% (220 statements)
coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

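# Extractor for part-of-speech sections on the Vietnamese Wiktionary
# (vi.wiktionary.org): glosses, sense labels, form-of/alt-of links,
# headword-line forms, notes, and Chinese classifiers.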
import re

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .example import extract_example_list_item
from .models import AltForm, Classifier, Form, Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags

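# Build a WordEntry for one part-of-speech section: copy base_data, apply the
# POS name and tags from POS_DATA, read the numbered ("#") gloss lists, and
# pass templates that appear before the first gloss list to the headword
# handler.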
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    page_data[-1].pos = pos_data["pos"]
    base_data.pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    gloss_list_index = len(level_node.children)
    for index, list_node in level_node.find_child(NodeKind.LIST, True):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
                if index < gloss_list_index:
                    gloss_list_index = index

    for node in level_node.children[:gloss_list_index]:
        if isinstance(node, TemplateNode):
            extract_headword_template(wxr, page_data[-1], node)

# redirect
ALT_OF_TEMPLATES = frozenset(["altform", "alt form", "vi-alt sp", "vie-alt sp"])
FORM_OF_TEMPLATES = frozenset(["số nhiều của", "short for"])

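# Parse one "#" list item into a Sense: route label/term/form-of/@/zho-mw
# templates to their handlers, keep everything else as gloss text, then
# recurse into nested "#" sub-lists and collect "#:"/"#*" items as examples.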
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    sense = (
        parent_sense.model_copy(deep=True)
        if parent_sense is not None
        else Sense()
    )
    sense.examples.clear()
    gloss_nodes = []
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name in ["nhãn", "label", "def-lb", "context"]:
                extract_label_template(wxr, sense, node)
            elif node.template_name == "term":
                extract_term_template(wxr, sense, node)
            elif (
                node.template_name.endswith((" of", "-of"))
                or node.template_name in ALT_OF_TEMPLATES
                or node.template_name in FORM_OF_TEMPLATES
            ):
                extract_form_of_template(wxr, sense, node)
                gloss_nodes.append(node)
            elif node.template_name == "@":
                extract_at_template(wxr, sense, node)
            elif node.template_name in ["zho-mw", "zh-mw"]:
                extract_zh_mw_template(wxr, node, sense)
            else:
                gloss_nodes.append(node)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            gloss_nodes.append(node)
    gloss_str = clean_node(wxr, sense, gloss_nodes)
    if gloss_str != "":
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)
        elif child_list.sarg.startswith("#") and child_list.sarg.endswith(
            (":", "*")
        ):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, child_list_item
                )

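# {{nhãn}} ("label"): copy each comma-separated piece of the expanded
# "label-content" spans into sense.raw_tags, then run clean_node over the
# whole expansion so category links are recorded on the sense.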
def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Bản_mẫu:nhãn
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_classes = span_tag.attrs.get("class", "").split()
        if "label-content" in span_classes:
            for raw_tag in clean_node(wxr, None, span_tag).split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    sense.raw_tags.append(raw_tag)
    clean_node(wxr, sense, expanded_node)

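# {{term}}: the italic text of the expansion is treated as a raw tag on the
# sense.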
def extract_term_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Bản_mẫu:term
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for italic_node in expanded_node.find_child(NodeKind.ITALIC):
        raw_tag = clean_node(wxr, None, italic_node)
        if raw_tag != "":
            sense.raw_tags.append(raw_tag)

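# "... of"/alt-form templates: the first <i> in the expansion is the linked
# word and a "mention-tr" span its romanization; the result goes into
# sense.alt_of (tag "alt-of") for alternative-form templates, otherwise into
# sense.form_of (tag "form-of").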
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Thể_loại:Bản_mẫu_dạng_từ
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    form = AltForm(word="")
    for i_tag in expanded_node.find_html_recursively("i"):
        form.word = clean_node(wxr, None, i_tag)
        break
    for span_tag in expanded_node.find_html_recursively("span"):
        if "mention-tr" in span_tag.attrs.get("class", "").split():
            form.roman = clean_node(wxr, None, span_tag)
            break
    is_alt_of = (
        "alternative" in t_node.template_name
        or t_node.template_name in ALT_OF_TEMPLATES
    )
    if form.word != "":
        if is_alt_of:
            sense.alt_of.append(form)
            sense.tags.append("alt-of")
        else:
            sense.form_of.append(form)
            sense.tags.append("form-of")

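# {{@}} (obsolete): split the expanded italic text on commas and semicolons
# into raw tags.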
def extract_at_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Thể_loại:@
    # obsolete template
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for i_tag in expanded_node.find_html("i"):
        text = clean_node(wxr, None, i_tag)
        for raw_tag in re.split(r",|;", text):
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                sense.raw_tags.append(raw_tag)

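# Note sections: one note per list item, or the whole section body as a
# single note when the section has no list.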
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    has_list = False
    for list_node in level_node.find_child(NodeKind.LIST):
        has_list = True
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            note = clean_node(wxr, None, list_item.children)
            if note != "":
                word_entry.notes.append(note)
    if not has_list:
        note = clean_node(
            wxr,
            None,
            list(
                level_node.invert_find_child(
                    LEVEL_KIND_FLAGS, include_empty_str=True
                )
            ),
        )
        if note != "":
            word_entry.notes.append(note)

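# Headword-line templates: walk the expanded "headword-line" span and turn its
# <strong>/<span>/<i>/<b> children into Form objects (canonical form,
# romanization, gender, inline labels, bold alternative forms). Templates
# whose expansion has no headword-line span (the Template:eng-noun case) fall
# back to italic raw tags paired with "form-of" spans.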
def extract_headword_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    forms = []
    has_headword_span = False
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        has_headword_span = True
        i_tags = []
        for html_node in main_span_tag.find_child(NodeKind.HTML):
            class_names = html_node.attrs.get("class", "").split()
            if html_node.tag == "strong" and "headword" in class_names:
                ruby, no_ruby = extract_ruby(wxr, html_node)
                strong_str = clean_node(wxr, None, no_ruby)
                if strong_str not in ["", wxr.wtp.title] or len(ruby) > 0:
                    forms.append(
                        Form(form=strong_str, tags=["canonical"], ruby=ruby)
                    )
            elif html_node.tag == "span":
                if "headword-tr" in class_names or "tr" in class_names:
                    roman = clean_node(wxr, None, html_node)
                    if (
                        len(forms) > 0
                        and "canonical" not in forms[-1].tags
                        and "romanization" not in forms[-1].tags
                    ):
                        forms[-1].roman = roman
                    elif roman != "":
                        forms.append(Form(form=roman, tags=["romanization"]))
                elif "gender" in class_names:
                    for abbr_tag in html_node.find_html("abbr"):
                        gender_tag = abbr_tag.attrs.get(
                            "title", clean_node(wxr, None, abbr_tag)
                        )
                        if (
                            len(forms) > 0
                            and "canonical" not in forms[-1].tags
                            and "romanization" not in forms[-1].tags
                        ):
                            forms[-1].raw_tags.append(gender_tag)
                        else:
                            word_entry.raw_tags.append(gender_tag)
                elif "ib-content" in class_names:
                    raw_tag = clean_node(wxr, None, html_node)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
            elif html_node.tag == "sup" and word_entry.lang_code == "ja":
                forms.append(extract_historical_kana(wxr, html_node))
            elif html_node.tag == "i":
                if len(i_tags) > 0:
                    word_entry.raw_tags.extend(i_tags)
                    i_tags.clear()
                for i_child in html_node.children:
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":
                        i_tags.append(raw_tag)
            elif html_node.tag == "b":
                ruby, no_ruby = extract_ruby(wxr, html_node)
                for form_str in filter(
                    None,
                    map(str.strip, clean_node(wxr, None, no_ruby).split(",")),
                ):
                    form = Form(form=form_str, ruby=ruby)
                    if i_tags == ["hoặc"]:
                        if len(forms) > 0:
                            form.raw_tags.extend(forms[-1].raw_tags)
                    else:
                        form.raw_tags.extend(i_tags)
                    forms.append(form)
                i_tags.clear()

        if len(i_tags) > 0:
            word_entry.raw_tags.extend(i_tags)
    for form in forms:
        translate_raw_tags(form)
    word_entry.forms.extend(forms)
    clean_node(wxr, word_entry, expanded_node)
    translate_raw_tags(word_entry)

    if not has_headword_span:
        # Template:eng-noun
        raw_tag = ""
        for node in expanded_node.find_child_recursively(
            NodeKind.ITALIC | NodeKind.HTML
        ):
            if node.kind == NodeKind.ITALIC:
                raw_tag = clean_node(wxr, None, node)
            elif (
                isinstance(node, HTMLNode)
                and node.tag == "span"
                and "form-of" in node.attrs.get("class", "").split()
            ):
                form = Form(form=clean_node(wxr, None, node))
                if raw_tag != "":
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
                    raw_tag = ""
                if form.form != "":
                    word_entry.forms.append(form)

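# Japanese historical kana given in a <sup> tag: the <strong> text is the
# archaic form, a class="tr" span its romanization.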
def extract_historical_kana(
    wxr: WiktextractContext, sup_node: HTMLNode
) -> Form:
    form = Form(form="", tags=["archaic"])
    for strong_node in sup_node.find_html("strong"):
        form.form = clean_node(wxr, None, strong_node)
    for span_node in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        form.roman = clean_node(wxr, None, span_node)
    return form

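# {{zho-mw}}: Chinese measure words (classifiers). Spans with class
# "Hani"/"Hant"/"Hans" are classifier words, classifiers separated by "/" are
# kept in the same group, and span "title" attributes supply raw tags for the
# current group.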
def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
):
    # Chinese inline classifier template
    # https://zh.wiktionary.org/wiki/Bản_mẫu:zho-mw
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)