Coverage for src/wiktextract/extractor/th/pos.py: 75%

204 statements  

coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

import itertools
import re

from wikitextprocessor import (
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import AltForm, Classifier, Form, Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    page_data[-1].pos = pos_data["pos"]
    base_data.pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    gloss_list_index = len(level_node.children)
    for index, list_node in level_node.find_child(NodeKind.LIST, True):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
                if index < gloss_list_index:  # coverage: condition always true
                    gloss_list_index = index

    for node in level_node.children[:gloss_list_index]:
        if isinstance(node, TemplateNode) and node.template_name == "th-noun":
            extract_th_noun_template(wxr, page_data[-1], node)
        elif isinstance(node, TemplateNode) and node.template_name in [
            "th-verb",
            "th-adj",
        ]:
            extract_th_verb_adj_template(wxr, page_data[-1], node)
        elif isinstance(node, TemplateNode):
            extract_headword_line_template(wxr, page_data[-1], node)

# template name redirects
ALT_OF_TEMPLATES = frozenset(["altform", "alt form", "alt sp", "altsp"])
# "อักษรย่อ" = "initialism", "คำย่อ" = "abbreviation"
FORM_OF_TEMPLATES = frozenset(["อักษรย่อ", "คำย่อ"])

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    sense = (
        parent_sense.model_copy(deep=True)
        if parent_sense is not None
        else Sense()
    )
    gloss_nodes = []
    has_form_of_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode) and node.template_name in [
            "label",
            "lb",
            "lbl",
        ]:
            extract_label_template(wxr, sense, node)
        elif isinstance(node, TemplateNode) and node.template_name == "cls":
            extract_cls_template(wxr, sense, node)
        elif isinstance(node, TemplateNode) and (
            node.template_name.endswith(" of")
            or node.template_name.startswith("alternate ")
            or node.template_name in ALT_OF_TEMPLATES
            or node.template_name in FORM_OF_TEMPLATES
        ):
            extract_form_of_template(wxr, word_entry, sense, node)
            has_form_of_template = True
        elif isinstance(node, TemplateNode) and node.template_name == "zh-mw":  # coverage: condition never true
            extract_zh_mw_template(wxr, node, sense)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            gloss_nodes.append(node)

    if not has_form_of_template:
        gloss_str = clean_node(wxr, sense, gloss_nodes)
        if gloss_str != "":  # coverage: condition always true
            sense.glosses.append(gloss_str)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.startswith("#") and child_list.sarg.endswith(
            (":", "*")
        ):
            for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, e_list_item)
        elif child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):  # coverage: condition always true
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)

def extract_label_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    # https://th.wiktionary.org/wiki/แม่แบบ:label
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        span_str = clean_node(wxr, None, span_tag)
        # split on "," or the Thai conjunction "หรือ" ("or")
        for raw_tag in re.split(r",| หรือ ", span_str):
            raw_tag = raw_tag.strip()
            if raw_tag != "":  # coverage: condition always true
                sense.raw_tags.append(raw_tag)
    clean_node(wxr, sense, expanded_node)

def extract_cls_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    # https://th.wiktionary.org/wiki/แม่แบบ:cls
    for arg_name in itertools.count(2):  # coverage: loop always exited via break
        if arg_name not in t_node.template_parameters:
            break
        cls = clean_node(wxr, None, t_node.template_parameters[arg_name])
        if cls != "":  # coverage: condition always true
            sense.classifiers.append(Classifier(classifier=cls))
    clean_node(wxr, sense, t_node)

def extract_th_noun_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    # https://th.wiktionary.org/wiki/แม่แบบ:th-noun
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for b_tag in expanded_node.find_html_recursively("b"):
        cls = clean_node(wxr, None, b_tag)
        if cls != "":  # coverage: condition always true
            word_entry.classifiers.append(Classifier(classifier=cls))

    clean_node(wxr, word_entry, expanded_node)

def extract_th_verb_adj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    # https://th.wiktionary.org/wiki/แม่แบบ:th-verb
    # https://th.wiktionary.org/wiki/แม่แบบ:th-adj
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for b_tag in expanded_node.find_html_recursively("b"):
        form_str = clean_node(wxr, None, b_tag)
        if form_str != "":  # coverage: condition always true
            word_entry.forms.append(
                Form(
                    form=form_str,
                    tags=[
                        "abstract-noun"
                        if t_node.template_name == "th-verb"
                        else "noun-from-adj"
                    ],
                )
            )

    clean_node(wxr, word_entry, expanded_node)

def extract_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            note_str = clean_node(
                wxr,
                word_entry,
                list(
                    list_item.invert_find_child(
                        NodeKind.LIST, include_empty_str=True
                    )
                ),
            )
            if note_str != "":
                word_entry.notes.append(note_str)

def extract_form_of_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    first_sense: Sense,
    t_node: TemplateNode,
) -> None:
    form = AltForm(word="")
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    senses = []
    if expanded_node.contain_node(NodeKind.LIST):
        first_list_idx = len(expanded_node.children)
        first_gloss = ""
        for index, node in enumerate(expanded_node.children):
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                if index < first_list_idx:  # coverage: condition always true
                    first_list_idx = index
                    first_gloss = clean_node(
                        wxr, first_sense, expanded_node.children[:index]
                    )
                    if first_gloss != "":  # coverage: condition always true
                        first_sense.glosses.append(first_gloss)
                        senses.append(first_sense)
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense = Sense()
                    if first_gloss != "":  # coverage: condition always true
                        sense.glosses.append(first_gloss)
                    gloss = clean_node(wxr, sense, list_item.children)
                    if gloss != "":  # coverage: condition always true
                        sense.glosses.append(gloss)
                        senses.append(sense)
    else:
        gloss = clean_node(wxr, first_sense, expanded_node)
        if gloss != "":  # coverage: condition always true
            first_sense.glosses.append(gloss)
            senses.append(first_sense)

    for i_tag in expanded_node.find_html_recursively("i"):  # coverage: loop always exited via break
        form.word = clean_node(wxr, None, i_tag)
        break
    for span_tag in expanded_node.find_html_recursively("span"):
        if "mention-tr" in span_tag.attrs.get("class", ""):
            form.roman = clean_node(wxr, None, span_tag)
            break
    is_alt_of = (
        t_node.template_name.startswith(("alternative ", "alternate "))
        or t_node.template_name in ALT_OF_TEMPLATES
    )
    if form.word != "":  # coverage: condition always true
        for sense in senses:
            if is_alt_of:
                sense.alt_of.append(form)
            else:
                sense.form_of.append(form)
            if is_alt_of and "alt-of" not in sense.tags:
                sense.tags.append("alt-of")
            if not is_alt_of and "form-of" not in sense.tags:
                sense.tags.append("form-of")
    word_entry.senses.extend(senses)

def extract_usage_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            note_str = clean_node(wxr, None, list_item.children)
            if note_str != "":
                word_entry.notes.append(note_str)

def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    # Chinese inline classifier template
    # copied from zh edition code
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)

def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        for strong_tag in main_span_tag.find_html(
            "strong", attr_name="class", attr_value="headword"
        ):
            strong_str = clean_node(wxr, None, strong_tag)
            if strong_str not in ["", wxr.wtp.title]:  # coverage: condition always true
                word_entry.forms.append(
                    Form(form=strong_str, tags=["canonical"])
                )
        for roman_span in main_span_tag.find_html(
            "span", attr_name="class", attr_value="headword-tr"
        ):
            roman = clean_node(wxr, None, roman_span)
            if roman != "":  # coverage: condition always true
                word_entry.forms.append(
                    Form(form=roman, tags=["transliteration"])
                )
        for gender_span in main_span_tag.find_html(
            "span", attr_name="class", attr_value="gender"
        ):
            for abbr_tag in gender_span.find_html("abbr"):
                word_entry.raw_tags.append(clean_node(wxr, None, abbr_tag))
        form_raw_tag = ""
        for html_tag in main_span_tag.find_child(NodeKind.HTML):
            if html_tag.tag == "i":
                form_raw_tag = clean_node(wxr, None, html_tag)
            elif html_tag.tag == "b":
                form_str = clean_node(wxr, None, html_tag)
                if form_str != "":  # coverage: condition always true
                    form = Form(form=form_str)
                    if form_raw_tag != "":  # coverage: condition always true
                        form.raw_tags.append(form_raw_tag)
                        translate_raw_tags(form)
                    word_entry.forms.append(form)

    clean_node(wxr, word_entry, expanded_node)
    translate_raw_tags(word_entry)
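

The one branch above that tests never reach is the "zh-mw" check in extract_gloss_list_item (condition never true). A minimal sketch of how that path could be driven directly, assuming the usual wiktextract test scaffolding (Wtp, WiktionaryConfig, WiktextractContext); the page title, the mocked template body, and the WordEntry field values are illustrative assumptions, not taken from the project's test suite:

from wikitextprocessor import NodeKind, Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.th.models import WordEntry
from wiktextract.extractor.th.pos import extract_gloss_list_item
from wiktextract.wxr_context import WiktextractContext

# Assumed test scaffolding for the Thai edition.
wxr = WiktextractContext(
    Wtp(lang_code="th"),
    WiktionaryConfig(dump_file_lang_code="th", capture_language_codes=None),
)
# Hypothetical expansion for {{zh-mw}}: extract_zh_mw_template() only looks at
# <span> tags whose class is "Hani", "Hant" or "Hans".
wxr.wtp.add_page("แม่แบบ:zh-mw", 10, '<span class="Hani">個</span>')
wxr.wtp.start_page("貓")

root = wxr.wtp.parse("# แมว {{zh-mw|m:個}}")  # "แมว" = "cat"
list_node = next(root.find_child(NodeKind.LIST))
list_item = next(list_node.find_child(NodeKind.LIST_ITEM))

# Illustrative field values; only the list item matters for this code path.
entry = WordEntry(word="貓", lang="จีน", lang_code="zh", pos="noun")
extract_gloss_list_item(wxr, entry, list_item)
print(entry.senses[0].glosses)      # ["แมว"]
print(entry.senses[0].classifiers)  # one Classifier with classifier="個"

A test along these lines would exercise extract_zh_mw_template through its "Hani" branch and remove the zh-mw condition from the partial-branch list.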