Coverage for src/wiktextract/extractor/th/pos.py: 75%

204 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import itertools 

2import re 

3 

4from wikitextprocessor import ( 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .example import extract_example_list_item 

14from .models import AltForm, Classifier, Form, Sense, WordEntry 

15from .section_titles import POS_DATA 

16from .tags import translate_raw_tags 

17 

18 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Open a new word entry for a part-of-speech section and fill it in.

    A deep copy of ``base_data`` is appended to ``page_data`` and becomes
    the entry of this section.  List items of lists whose marker consists
    of ``#`` at both ends are extracted as senses, and every template placed
    before the first such (non-empty) list is handled as a headword-line
    template.

    Besides the new entry, ``base_data.pos`` is also overwritten with the
    POS of this section.  ``POS_DATA[pos_title]`` raises ``KeyError`` for
    an unknown title, after the copy has already been appended.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Index (among the section children) of the first gloss list that has
    # at least one item; defaults to "past the end" when there is none.
    first_gloss_list = len(level_node.children)
    for child_index, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)
            first_gloss_list = min(first_gloss_list, child_index)

    # Only templates in front of the glosses belong to the headword line.
    for node in level_node.children[:first_gloss_list]:
        if not isinstance(node, TemplateNode):
            continue
        if node.template_name == "th-noun":
            extract_th_noun_template(wxr, entry, node)
        elif node.template_name in ("th-verb", "th-adj"):
            extract_th_verb_adj_template(wxr, entry, node)
        else:
            extract_headword_line_template(wxr, entry, node)

51 

52 

# Redirect/short template names that are handled like the
# "alternative ..."/"alternate ..." templates (senses get "alt-of").
ALT_OF_TEMPLATES = frozenset({"altform", "alt form", "alt sp", "altsp"})
# Thai-named templates handled as form-of templates; the names roughly
# mean "initialism" and "abbreviation".
FORM_OF_TEMPLATES = frozenset({"อักษรย่อ", "คำย่อ"})

56 

57 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item, including its nested lists, into senses.

    The sense starts as a deep copy of ``parent_sense`` (or empty when there
    is no parent).  Label, classifier, form-of and ``zh-mw`` templates are
    dispatched to their extractors; all remaining non-list children form
    the gloss text.  Nested ``#:``/``#*`` lists are examples of this sense,
    nested ``##`` lists are sub-senses extracted recursively.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_nodes = []
    form_of_seen = False
    for child in list_item.children:
        # ``None`` marks "not a template", so no branch below that tests
        # the name can match.
        name = child.template_name if isinstance(child, TemplateNode) else None
        if name in ("label", "lb", "lbl"):
            extract_label_template(wxr, sense, child)
        elif name == "cls":
            extract_cls_template(wxr, sense, child)
        elif name is not None and (
            name.endswith(" of")
            or name.startswith("alternate ")
            or name in ALT_OF_TEMPLATES
            or name in FORM_OF_TEMPLATES
        ):
            extract_form_of_template(wxr, word_entry, sense, child)
            form_of_seen = True
        elif name == "zh-mw":
            extract_zh_mw_template(wxr, child, sense)
        elif not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
            # Anything that is not a nested list is part of the gloss.
            gloss_nodes.append(child)

    # When a form-of template was present, extract_form_of_template() has
    # already added the sense(s) to the entry.
    # NOTE(review): in that case translate_raw_tags() is never applied to
    # the sense here - confirm whether that is intended.
    if not form_of_seen:
        gloss_text = clean_node(wxr, sense, gloss_nodes)
        if gloss_text != "":
            sense.glosses.append(gloss_text)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)

    for nested_list in list_item.find_child(NodeKind.LIST):
        marker = nested_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith((":", "*")):
            for example_item in nested_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )
        elif marker.endswith("#"):
            for sub_item in nested_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)

109 

110 

def extract_label_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Add the labels rendered by a label template to ``sense.raw_tags``.

    The template is expanded and the text of every ``span`` with class
    ``ib-content`` is split on commas and on the Thai word for "or"; each
    non-empty, stripped piece becomes one raw tag.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:label
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for content_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        label_text = clean_node(wxr, None, content_span)
        pieces = (piece.strip() for piece in re.split(r",| หรือ ", label_text))
        sense.raw_tags.extend(piece for piece in pieces if piece != "")
    # Second pass with the sense as target; the returned text is unused
    # (presumably this records categories on the sense - verify against
    # clean_node).
    clean_node(wxr, sense, expanded)

129 

130 

def extract_cls_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Add the classifiers listed in a ``cls`` template to the sense.

    Positional arguments are read starting at 2 and stop at the first
    missing position; each non-empty cleaned value becomes a classifier.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:cls
    params = t_node.template_parameters
    position = 2
    while position in params:
        classifier_text = clean_node(wxr, None, params[position])
        if classifier_text != "":
            sense.classifiers.append(Classifier(classifier=classifier_text))
        position += 1
    # The returned text is unused; the call is made with the sense as
    # target (presumably for categories - verify against clean_node).
    clean_node(wxr, sense, t_node)

144 

145 

def extract_th_noun_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Record the classifiers shown in bold by a ``th-noun`` template.

    Every non-empty ``<b>`` element of the expanded template is stored as a
    classifier of the word entry.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-noun
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    bold_texts = (
        clean_node(wxr, None, bold_tag)
        for bold_tag in expanded.find_html_recursively("b")
    )
    word_entry.classifiers.extend(
        Classifier(classifier=text) for text in bold_texts if text != ""
    )

    # The returned text is unused; the call is made with the entry as
    # target (presumably for categories - verify against clean_node).
    clean_node(wxr, word_entry, expanded)

161 

162 

def extract_th_verb_adj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Record the forms shown in bold by a ``th-verb``/``th-adj`` template.

    Every non-empty ``<b>`` element of the expanded template is added to
    ``word_entry.forms``, tagged "abstract-noun" when the template is
    ``th-verb`` and "noun-from-adj" otherwise.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-verb
    # https://th.wiktionary.org/wiki/แม่แบบ:th-adj
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # The tag depends only on the template name, so choose it once instead
    # of once per bold element.
    form_tag = (
        "abstract-noun"
        if t_node.template_name == "th-verb"
        else "noun-from-adj"
    )
    for b_tag in expanded_node.find_html_recursively("b"):
        form_str = clean_node(wxr, None, b_tag)
        if form_str != "":
            # A fresh list per form so forms never share a tags list.
            word_entry.forms.append(Form(form=form_str, tags=[form_tag]))

    # The returned text is unused; the call is made with the entry as
    # target (presumably for categories - verify against clean_node).
    clean_node(wxr, word_entry, expanded_node)

188 

189 

def extract_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Append one note per list item of the section to ``word_entry.notes``.

    Nested lists inside an item are left out of the note text, and items
    whose cleaned text is empty are skipped.
    """
    list_items = (
        item
        for list_node in level_node.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
    )
    for item in list_items:
        # Everything in the item except its nested lists.
        own_nodes = list(item.invert_find_child(NodeKind.LIST))
        note_text = clean_node(wxr, word_entry, own_nodes)
        if note_text == "":
            continue
        word_entry.notes.append(note_text)

204 

205 

def extract_form_of_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    first_sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Create senses from a form-of/alt-of template and link the base word.

    Without a list in the expansion, the whole expanded text is the gloss of
    ``first_sense``.  With lists, the text before the first list is the
    gloss of ``first_sense`` and every list item yields an additional sense
    holding that leading gloss plus the item text.  The first ``<i>``
    element supplies the linked word and the first ``span`` whose class
    contains ``mention-tr`` its romanization.  All collected senses are
    appended to ``word_entry.senses``.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    new_senses = []
    if not expanded.contain_node(NodeKind.LIST):
        whole_gloss = clean_node(wxr, first_sense, expanded)
        if whole_gloss != "":
            first_sense.glosses.append(whole_gloss)
            new_senses.append(first_sense)
    else:
        lead_gloss = ""
        first_list_pos = len(expanded.children)
        for pos, child in enumerate(expanded.children):
            if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
                continue
            if pos < first_list_pos:
                # Only reached for the first list: the text in front of it
                # is the gloss shared by all list items.
                first_list_pos = pos
                lead_gloss = clean_node(
                    wxr, first_sense, expanded.children[:pos]
                )
                if lead_gloss != "":
                    first_sense.glosses.append(lead_gloss)
                    new_senses.append(first_sense)
            for item in child.find_child(NodeKind.LIST_ITEM):
                item_sense = Sense()
                if lead_gloss != "":
                    item_sense.glosses.append(lead_gloss)
                item_gloss = clean_node(wxr, item_sense, item.children)
                if item_gloss != "":
                    item_sense.glosses.append(item_gloss)
                    new_senses.append(item_sense)

    form = AltForm(word="")
    first_i_tag = next(iter(expanded.find_html_recursively("i")), None)
    if first_i_tag is not None:
        form.word = clean_node(wxr, None, first_i_tag)
    roman_span = next(
        (
            span_tag
            for span_tag in expanded.find_html_recursively("span")
            if "mention-tr" in span_tag.attrs.get("class", "")
        ),
        None,
    )
    if roman_span is not None:
        form.roman = clean_node(wxr, None, roman_span)

    template_name = t_node.template_name
    is_alt_of = (
        template_name.startswith(("alternative ", "alternate "))
        or template_name in ALT_OF_TEMPLATES
    )
    if form.word != "":
        relation_tag = "alt-of" if is_alt_of else "form-of"
        for linked_sense in new_senses:
            # The same form object is attached to every collected sense.
            if is_alt_of:
                linked_sense.alt_of.append(form)
            else:
                linked_sense.form_of.append(form)
            if relation_tag not in linked_sense.tags:
                linked_sense.tags.append(relation_tag)
    word_entry.senses.extend(new_senses)

266 

267 

def extract_usage_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Append the text of each list item in the section as a note.

    The full content of an item (all of its children) is cleaned, and empty
    results are dropped.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        note_texts = (
            clean_node(wxr, None, item.children)
            for item in list_node.find_child(NodeKind.LIST_ITEM)
        )
        word_entry.notes.extend(text for text in note_texts if text != "")

278 

279 

def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Extract classifiers from the Chinese inline classifier template.

    Spans with class ``Hani``/``Hant``/``Hans`` hold classifier words (a
    lone "/" is a separator, not a word); spans with a ``title`` attribute
    supply a raw tag for the classifiers currently pending.  Pending
    classifiers are flushed to ``sense.classifiers`` when a new word follows
    that is not joined by "/", and once more at the end.
    """
    # Chinese inline classifier template
    # copied from zh edition code
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    pending = []
    previous_word = ""
    for span_tag in expanded.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ("Hani", "Hant", "Hans"):
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                new_classifier = Classifier(classifier=word)
                if span_class in script_tags:
                    new_classifier.tags.append(script_tags[span_class])

                # A word not preceded by "/" starts a new group, so the
                # previous group is complete.
                if pending and previous_word != "/":
                    sense.classifiers.extend(pending)
                    pending.clear()
                pending.append(new_classifier)
            previous_word = word
        elif "title" in span_tag.attrs:
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if raw_tag:
                for pending_classifier in pending:
                    pending_classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(pending)
    for stored_classifier in sense.classifiers:
        translate_raw_tags(stored_classifier)

314 

315 

def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms and gender tags from a generic headword-line template.

    Within each ``span`` of class ``headword-line`` in the expansion:

    * ``strong.headword`` text that is non-empty and differs from the page
      title becomes a form tagged "canonical";
    * ``span.headword-tr`` text becomes a form tagged "transliteration";
    * ``abbr`` texts inside ``span.gender`` become raw tags of the entry;
    * each direct ``<b>`` child becomes a form, labelled with the text of
      the most recent preceding direct ``<i>`` child (if any).
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        for strong_tag in main_span_tag.find_html(
            "strong", attr_name="class", attr_value="headword"
        ):
            strong_str = clean_node(wxr, None, strong_tag)
            if strong_str not in ["", wxr.wtp.title]:
                word_entry.forms.append(
                    Form(form=strong_str, tags=["canonical"])
                )
        for roman_span in main_span_tag.find_html(
            "span", attr_name="class", attr_value="headword-tr"
        ):
            roman = clean_node(wxr, None, roman_span)
            if roman != "":
                word_entry.forms.append(
                    Form(form=roman, tags=["transliteration"])
                )
        for gender_span in main_span_tag.find_html(
            "span", attr_name="class", attr_value="gender"
        ):
            for abbr_tag in gender_span.find_html("abbr"):
                gender_str = clean_node(wxr, None, abbr_tag)
                # Skip empty abbreviations, like every other extracted
                # text in this module, so no "" raw tag is stored.
                if gender_str != "":
                    word_entry.raw_tags.append(gender_str)
        # The label is deliberately kept across <b> tags: several forms
        # may follow a single <i> label.
        form_raw_tag = ""
        for html_tag in main_span_tag.find_child(NodeKind.HTML):
            if html_tag.tag == "i":
                form_raw_tag = clean_node(wxr, None, html_tag)
            elif html_tag.tag == "b":
                form_str = clean_node(wxr, None, html_tag)
                if form_str != "":
                    form = Form(form=form_str)
                    if form_raw_tag != "":
                        form.raw_tags.append(form_raw_tag)
                        translate_raw_tags(form)
                    word_entry.forms.append(form)

    # The returned text is unused; the call is made with the entry as
    # target (presumably for categories - verify against clean_node).
    clean_node(wxr, word_entry, expanded_node)
    translate_raw_tags(word_entry)