Coverage for src / wiktextract / extractor / th / pos.py: 77%

240 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import itertools 

2import re 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..ruby import extract_ruby 

15from .example import extract_example_list_item 

16from .models import AltForm, Classifier, Form, Sense, WordEntry 

17from .section_titles import POS_DATA 

18from .tags import translate_raw_tags 

19 

20 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Parse one part-of-speech section into a new WordEntry.

    A deep copy of ``base_data`` is appended to ``page_data``, filled with
    the POS name/tags from ``POS_DATA``, the glosses from ordered ("#")
    lists, and data from headword-line templates that appear before the
    first gloss list.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    # keep base_data in sync so sibling sections inherit the POS
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Glosses live in "#...#" lists; remember the index of the first such
    # list so headword templates can be looked up before it.
    first_gloss_index = len(level_node.children)
    for child_index, l_node in level_node.find_child(NodeKind.LIST, True):
        if not (l_node.sarg.startswith("#") and l_node.sarg.endswith("#")):
            continue
        for item in l_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, item)
            first_gloss_index = min(first_gloss_index, child_index)

    # Headword-line templates sit above the first gloss list.
    for node in level_node.children[:first_gloss_index]:
        if not isinstance(node, TemplateNode):
            continue
        if node.template_name == "th-noun":
            extract_th_noun_template(wxr, entry, node)
        elif node.template_name in ("th-verb", "th-adj"):
            extract_th_verb_adj_template(wxr, entry, node)
        else:
            extract_headword_line_template(wxr, entry, node)

53 

54 

# redirect
# Template-name aliases treated as "alternative form/spelling of" templates;
# checked in extract_gloss_list_item and extract_form_of_template.
ALT_OF_TEMPLATES = frozenset(["altform", "alt form", "alt sp", "altsp"])
# Thai template names routed to extract_form_of_template as "form of"
# templates (presumably abbreviation/initialism templates — confirm on
# th.wiktionary).
FORM_OF_TEMPLATES = frozenset(["อักษรย่อ", "คำย่อ"])

58 

59 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Parse one gloss list item into one or more Sense objects.

    Label/qualifier and classifier templates at the start of the item are
    folded into the sense; "form of"-style templates delegate sense
    creation to extract_form_of_template; the remaining nodes become the
    gloss text. Nested example lists ("#:", "#*") and nested gloss lists
    ("##") are processed recursively, sub-glosses inheriting this sense.
    """
    # Child glosses start from a deep copy of the parent sense so labels
    # and glosses accumulate down the "##" hierarchy.
    sense = (
        parent_sense.model_copy(deep=True)
        if parent_sense is not None
        else Sense()
    )
    gloss_nodes = []
    has_form_of_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode) and node.template_name in [
            "label",
            "lb",
            "lbl",
            "qualifier",
            "q",
            "qf",
            "qual",
        ]:
            extract_label_template(wxr, sense, node)
        elif isinstance(node, TemplateNode) and node.template_name == "cls":
            extract_cls_template(wxr, sense, node)
        elif isinstance(node, TemplateNode) and (
            node.template_name.endswith(" of")
            or node.template_name.startswith("alternate ")
            # NOTE(review): extract_form_of_template also accepts the
            # "alternative " prefix (see its is_alt_of check) but this
            # dispatch only matches "alternate " — confirm whether
            # "alternative ..." templates are meant to be caught here too.
            or node.template_name in ALT_OF_TEMPLATES
            or node.template_name in FORM_OF_TEMPLATES
        ):
            extract_form_of_template(wxr, word_entry, sense, node)
            # form-of templates append senses themselves; skip the plain
            # gloss handling below
            has_form_of_template = True
        elif isinstance(node, TemplateNode) and node.template_name == "zh-mw":
            extract_zh_mw_template(wxr, node, sense)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            # everything except nested lists contributes to the gloss text
            gloss_nodes.append(node)

    if not has_form_of_template:
        gloss_str = clean_node(wxr, sense, gloss_nodes)
        if gloss_str != "":
            sense.glosses.append(gloss_str)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        # "#:" / "#*" lists hold usage examples for this sense
        if child_list.sarg.startswith("#") and child_list.sarg.endswith(
            (":", "*")
        ):
            for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, e_list_item)
        # "##" lists hold sub-glosses that inherit this sense's data
        elif child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)

115 

116 

def extract_label_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Collect labels from a {{label}}-family template into sense.raw_tags.

    https://th.wiktionary.org/wiki/แม่แบบ:label
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_node in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        # Labels are separated by commas or the Thai word "หรือ" ("or").
        text = clean_node(wxr, None, span_node)
        sense.raw_tags.extend(
            part
            for part in map(str.strip, re.split(r",| หรือ ", text))
            if part != ""
        )
    # Pick up any categories the template adds.
    clean_node(wxr, sense, expanded)

135 

136 

def extract_cls_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Read classifier words from a {{cls}} template into the sense.

    https://th.wiktionary.org/wiki/แม่แบบ:cls
    """
    # Classifiers are positional parameters 2, 3, 4, ... up to the first gap.
    param = 2
    while param in t_node.template_parameters:
        cls_str = clean_node(wxr, None, t_node.template_parameters[param])
        if cls_str != "":
            sense.classifiers.append(Classifier(classifier=cls_str))
        param += 1
    # Record any categories the template adds.
    clean_node(wxr, sense, t_node)

150 

151 

def extract_th_noun_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract classifier words from the {{th-noun}} headword template.

    https://th.wiktionary.org/wiki/แม่แบบ:th-noun
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Classifiers are rendered in bold in the expanded headword line.
    for bold_node in expanded.find_html_recursively("b"):
        bold_text = clean_node(wxr, None, bold_node)
        if bold_text != "":
            word_entry.classifiers.append(Classifier(classifier=bold_text))

    clean_node(wxr, word_entry, expanded)

167 

168 

def extract_th_verb_adj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract derived noun forms from {{th-verb}} / {{th-adj}} templates.

    https://th.wiktionary.org/wiki/แม่แบบ:th-verb
    https://th.wiktionary.org/wiki/แม่แบบ:th-adj
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Bolded forms are the abstract noun ({{th-verb}}) or the noun derived
    # from the adjective ({{th-adj}}).
    form_tag = (
        "abstract-noun"
        if t_node.template_name == "th-verb"
        else "noun-from-adj"
    )
    for bold_node in expanded.find_html_recursively("b"):
        bold_text = clean_node(wxr, None, bold_node)
        if bold_text != "":
            word_entry.forms.append(Form(form=bold_text, tags=[form_tag]))

    clean_node(wxr, word_entry, expanded)

194 

195 

def extract_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Append each note list item to word_entry.notes.

    Nested sub-lists inside an item are excluded from the note text;
    categories found while cleaning are attached to word_entry.
    """
    for note_list in level_node.find_child(NodeKind.LIST):
        for item in note_list.find_child(NodeKind.LIST_ITEM):
            content = list(
                item.invert_find_child(
                    NodeKind.LIST, include_empty_str=True
                )
            )
            text = clean_node(wxr, word_entry, content)
            if text != "":
                word_entry.notes.append(text)

214 

215 

def extract_form_of_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    first_sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Handle "form of" / "alternative form of" templates.

    Expands the template and builds one sense per gloss it renders (some
    of these templates expand to an intro line plus a list of glosses).
    The linked target word (first <i> element) and its romanization
    (<span class="mention-tr">) become an AltForm stored in each sense's
    ``alt_of`` or ``form_of`` list, and the matching "alt-of"/"form-of"
    tag is added. All created senses are appended to word_entry.senses.
    """
    form = AltForm(word="")
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    senses = []
    if expanded_node.contain_node(NodeKind.LIST):
        # Template expands to intro text followed by list(s) of glosses.
        first_list_idx = len(expanded_node.children)
        first_gloss = ""
        for index, node in enumerate(expanded_node.children):
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                if index < first_list_idx:
                    # First list seen: everything before it is the shared
                    # intro gloss, recorded once on first_sense.
                    first_list_idx = index
                    first_gloss = clean_node(
                        wxr, first_sense, expanded_node.children[:index]
                    )
                    if first_gloss != "":
                        first_sense.glosses.append(first_gloss)
                        senses.append(first_sense)
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    # Each list item becomes its own sense, prefixed with
                    # the shared intro gloss when present.
                    sense = Sense()
                    if first_gloss != "":
                        sense.glosses.append(first_gloss)
                    gloss = clean_node(wxr, sense, list_item.children)
                    if gloss != "":
                        sense.glosses.append(gloss)
                        senses.append(sense)
    else:
        # Simple expansion: the whole output is one gloss on first_sense.
        gloss = clean_node(wxr, first_sense, expanded_node)
        if gloss != "":
            first_sense.glosses.append(gloss)
            senses.append(first_sense)

    # The linked target word is the first italicized element.
    for i_tag in expanded_node.find_html_recursively("i"):
        form.word = clean_node(wxr, None, i_tag)
        break
    # Its romanization carries the "mention-tr" CSS class.
    for span_tag in expanded_node.find_html_recursively("span"):
        if "mention-tr" in span_tag.attrs.get("class", ""):
            form.roman = clean_node(wxr, None, span_tag)
            break
    is_alt_of = (
        t_node.template_name.startswith(("alternative ", "alternate "))
        or t_node.template_name in ALT_OF_TEMPLATES
    )
    if form.word != "":
        for sense in senses:
            if is_alt_of:
                sense.alt_of.append(form)
            else:
                sense.form_of.append(form)
            if is_alt_of and "alt-of" not in sense.tags:
                sense.tags.append("alt-of")
            if not is_alt_of and "form-of" not in sense.tags:
                sense.tags.append("form-of")
    word_entry.senses.extend(senses)

276 

277 

def extract_usage_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Copy each non-empty usage-note list item into word_entry.notes."""
    cleaned = (
        clean_node(wxr, None, item.children)
        for note_list in level_node.find_child(NodeKind.LIST)
        for item in note_list.find_child(NodeKind.LIST_ITEM)
    )
    word_entry.notes.extend(text for text in cleaned if text != "")

288 

289 

def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Extract classifiers from a Chinese inline {{zh-mw}} template.

    Copied from the zh edition code. The expanded template lists
    classifier words in Han-script spans; alternates joined by "/" are
    grouped so a following <span title="..."> label (e.g. a dialect note)
    applies to the whole group.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Pending group of classifiers not yet flushed to the sense; a "/"
    # separator keeps the next word in the same group.
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                # Hant/Hans spans mark the script variant; plain "Hani"
                # gets no script tag.
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                # A new word not preceded by "/" starts a new group:
                # flush the previous group first.
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # Tooltip text labels the current (still pending) group.
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    # Flush the final group.
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)

324 

325 

def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms, romanizations and tags from a generic headword line.

    Expands the template and walks the direct HTML children of the
    <span class="headword-line"> wrapper, dispatching on tag/class:
    canonical headword (<strong class="headword">), romanization spans,
    gender abbreviations, inline labels, historical kana (<sup>, Japanese
    only), italic qualifier text, and bold alternative forms. Collected
    forms are appended to word_entry.forms.
    """
    forms = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        # Raw tags gathered from <i> elements; they qualify the <b> form
        # that follows them, or fall through to word_entry at the end.
        i_tags = []
        for html_node in main_span_tag.find_child(NodeKind.HTML):
            class_names = html_node.attrs.get("class", "").split()
            if html_node.tag == "strong" and "headword" in class_names:
                ruby, no_ruby = extract_ruby(wxr, html_node)
                strong_str = clean_node(wxr, None, no_ruby)
                # Skip when it just repeats the page title with no ruby.
                if strong_str not in ["", wxr.wtp.title] or len(ruby) > 0:
                    forms.append(
                        Form(form=strong_str, tags=["canonical"], ruby=ruby)
                    )
            elif html_node.tag == "span":
                if "headword-tr" in class_names or "tr" in class_names:
                    roman = clean_node(wxr, None, html_node)
                    # Attach the romanization to the preceding plain form;
                    # otherwise store it as its own "romanization" form.
                    if (
                        len(forms) > 0
                        and "canonical" not in forms[-1].tags
                        and "romanization" not in forms[-1].tags
                    ):
                        forms[-1].roman = roman
                    elif roman != "":
                        forms.append(Form(form=roman, tags=["romanization"]))
                elif "gender" in class_names:
                    for abbr_tag in html_node.find_html("abbr"):
                        gender_tag = clean_node(wxr, None, abbr_tag)
                        # Same attachment rule as romanization above.
                        if (
                            len(forms) > 0
                            and "canonical" not in forms[-1].tags
                            and "romanization" not in forms[-1].tags
                        ):
                            forms[-1].raw_tags.append(gender_tag)
                        else:
                            word_entry.raw_tags.append(gender_tag)
                elif "ib-content" in class_names:
                    # Inline label applies to the whole entry.
                    raw_tag = clean_node(wxr, None, html_node)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
            elif html_node.tag == "sup" and word_entry.lang_code == "ja":
                # Japanese historical kana spelling.
                forms.append(extract_historical_kana(wxr, html_node))
            elif html_node.tag == "i":
                # A new italic run: previous unconsumed i_tags belong to
                # the entry itself.
                if len(i_tags) > 0:
                    word_entry.raw_tags.extend(i_tags)
                    i_tags.clear()
                for i_child in html_node.children:
                    # "^†" marks obsolete spellings; strip the marker.
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":
                        i_tags.append(raw_tag)
            elif html_node.tag == "b":
                ruby, no_ruby = extract_ruby(wxr, html_node)
                # Bold text may hold several comma-separated forms.
                for form_str in filter(
                    None,
                    map(str.strip, clean_node(wxr, None, no_ruby).split(",")),
                ):
                    form = Form(form=form_str, ruby=ruby)
                    # "หรือ" ("or") means this form shares the previous
                    # form's tags instead of taking the i_tags literally.
                    if i_tags == ["หรือ"]:
                        if len(forms) > 0:
                            form.raw_tags.extend(forms[-1].raw_tags)
                    else:
                        form.raw_tags.extend(i_tags)
                    forms.append(form)
                i_tags.clear()

        # Trailing italic tags with no following <b> qualify the entry.
        if len(i_tags) > 0:
            word_entry.raw_tags.extend(i_tags)
    for form in forms:
        translate_raw_tags(form)
    word_entry.forms.extend(forms)
    clean_node(wxr, word_entry, expanded_node)
    translate_raw_tags(word_entry)

408 

409 

def extract_historical_kana(
    wxr: WiktextractContext, sup_node: HTMLNode
) -> Form:
    """Build an "archaic" Form from a historical-kana <sup> element.

    The kana spelling sits in a <strong> tag and its romanization in a
    <span class="tr"> tag; either may be absent.
    """
    archaic_form = Form(form="", tags=["archaic"])
    for tr_span in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        archaic_form.roman = clean_node(wxr, None, tr_span)
    for kana_node in sup_node.find_html("strong"):
        archaic_form.form = clean_node(wxr, None, kana_node)
    return archaic_form