Coverage for src / wiktextract / extractor / th / pos.py: 77%

240 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import itertools 

2import re 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..ruby import extract_ruby 

15from .example import extract_example_list_item 

16from .models import AltForm, Classifier, Form, Sense, WordEntry 

17from .section_titles import POS_DATA 

18from .tags import translate_raw_tags 

19 

20 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Parse one part-of-speech section into a new WordEntry.

    A deep copy of ``base_data`` is appended to ``page_data``, filled with
    the POS name/tags from ``POS_DATA``, the glosses from ordered ("#")
    lists, and data from headword-line templates that appear before the
    first gloss list.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    # keep base_data in sync so sibling sections inherit the POS
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Glosses live in "#...#" lists; remember the index of the first such
    # list so headword templates can be looked up before it.
    first_gloss_index = len(level_node.children)
    for child_index, l_node in level_node.find_child(NodeKind.LIST, True):
        if not (l_node.sarg.startswith("#") and l_node.sarg.endswith("#")):
            continue
        for item in l_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, item)
            first_gloss_index = min(first_gloss_index, child_index)

    # Headword-line templates sit above the first gloss list.
    for node in level_node.children[:first_gloss_index]:
        if not isinstance(node, TemplateNode):
            continue
        if node.template_name == "th-noun":
            extract_th_noun_template(wxr, entry, node)
        elif node.template_name in ("th-verb", "th-adj"):
            extract_th_verb_adj_template(wxr, entry, node)
        else:
            extract_headword_line_template(wxr, entry, node)

53 

54 

# redirect
# Template-name aliases treated as "alternative form/spelling of" templates;
# checked in extract_gloss_list_item and extract_form_of_template.
ALT_OF_TEMPLATES = frozenset(["altform", "alt form", "alt sp", "altsp"])
# Thai template names routed to extract_form_of_template as "form of"
# templates (presumably abbreviation/initialism templates — confirm on
# th.wiktionary).
FORM_OF_TEMPLATES = frozenset(["อักษรย่อ", "คำย่อ"])

58 

59 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Parse one gloss list item into one or more Sense objects.

    Label/qualifier and classifier templates at the start of the item are
    folded into the sense; "form of"-style templates delegate sense
    creation to extract_form_of_template; the remaining nodes become the
    gloss text. Nested example lists ("#:", "#*") and nested gloss lists
    ("##") are processed recursively, sub-glosses inheriting this sense.
    """
    # Child glosses start from a deep copy of the parent sense so labels
    # and glosses accumulate down the "##" hierarchy.
    sense = (
        parent_sense.model_copy(deep=True)
        if parent_sense is not None
        else Sense()
    )
    gloss_nodes = []
    has_form_of_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode) and node.template_name in [
            "label",
            "lb",
            "lbl",
            "qualifier",
            "q",
            "qf",
            "qual",
        ]:
            extract_label_template(wxr, sense, node)
        elif isinstance(node, TemplateNode) and node.template_name == "cls":
            extract_cls_template(wxr, sense, node)
        elif isinstance(node, TemplateNode) and (
            node.template_name.endswith(" of")
            or node.template_name.startswith("alternate ")
            # NOTE(review): extract_form_of_template also accepts the
            # "alternative " prefix (see its is_alt_of check) but this
            # dispatch only matches "alternate " — confirm whether
            # "alternative ..." templates are meant to be caught here too.
            or node.template_name in ALT_OF_TEMPLATES
            or node.template_name in FORM_OF_TEMPLATES
        ):
            extract_form_of_template(wxr, word_entry, sense, node)
            # form-of templates append senses themselves; skip the plain
            # gloss handling below
            has_form_of_template = True
        elif isinstance(node, TemplateNode) and node.template_name == "zh-mw":
            extract_zh_mw_template(wxr, node, sense)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            # everything except nested lists contributes to the gloss text
            gloss_nodes.append(node)

    if not has_form_of_template:
        gloss_str = clean_node(wxr, sense, gloss_nodes)
        if gloss_str != "":
            sense.glosses.append(gloss_str)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        # "#:" / "#*" lists hold usage examples for this sense
        if child_list.sarg.startswith("#") and child_list.sarg.endswith(
            (":", "*")
        ):
            for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, e_list_item)
        # "##" lists hold sub-glosses that inherit this sense's data
        elif child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)

115 

116 

def extract_label_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Collect labels from a {{label}}-family template into sense.raw_tags.

    https://th.wiktionary.org/wiki/แม่แบบ:label
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_node in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        # Labels are separated by commas or the Thai word "หรือ" ("or").
        text = clean_node(wxr, None, span_node)
        sense.raw_tags.extend(
            part
            for part in map(str.strip, re.split(r",| หรือ ", text))
            if part != ""
        )
    # Pick up any categories the template adds.
    clean_node(wxr, sense, expanded)

135 

136 

def extract_cls_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Read classifier words from a {{cls}} template into the sense.

    https://th.wiktionary.org/wiki/แม่แบบ:cls
    """
    # Classifiers are positional parameters 2, 3, 4, ... up to the first gap.
    param = 2
    while param in t_node.template_parameters:
        cls_str = clean_node(wxr, None, t_node.template_parameters[param])
        if cls_str != "":
            sense.classifiers.append(Classifier(classifier=cls_str))
        param += 1
    # Record any categories the template adds.
    clean_node(wxr, sense, t_node)

150 

151 

def extract_th_noun_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract classifier words from the {{th-noun}} headword template.

    https://th.wiktionary.org/wiki/แม่แบบ:th-noun
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Classifiers are rendered in bold in the expanded headword line.
    for bold_node in expanded.find_html_recursively("b"):
        bold_text = clean_node(wxr, None, bold_node)
        if bold_text != "":
            word_entry.classifiers.append(Classifier(classifier=bold_text))

    clean_node(wxr, word_entry, expanded)

167 

168 

def extract_th_verb_adj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract derived noun forms from {{th-verb}} / {{th-adj}} templates.

    https://th.wiktionary.org/wiki/แม่แบบ:th-verb
    https://th.wiktionary.org/wiki/แม่แบบ:th-adj
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Bolded forms are the abstract noun ({{th-verb}}) or the noun derived
    # from the adjective ({{th-adj}}).
    form_tag = (
        "abstract-noun"
        if t_node.template_name == "th-verb"
        else "noun-from-adj"
    )
    for bold_node in expanded.find_html_recursively("b"):
        bold_text = clean_node(wxr, None, bold_node)
        if bold_text != "":
            word_entry.forms.append(Form(form=bold_text, tags=[form_tag]))

    clean_node(wxr, word_entry, expanded)

194 

195 

def extract_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Append each note list item to word_entry.notes.

    Nested sub-lists inside an item are excluded from the note text;
    categories found while cleaning are attached to word_entry.
    """
    for note_list in level_node.find_child(NodeKind.LIST):
        for item in note_list.find_child(NodeKind.LIST_ITEM):
            content = list(
                item.invert_find_child(
                    NodeKind.LIST, include_empty_str=True
                )
            )
            text = clean_node(wxr, word_entry, content)
            if text != "":
                word_entry.notes.append(text)

214 

215 

def extract_form_of_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    first_sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Handle "form of" / "alternative form of" templates.

    Expands the template and builds one sense per gloss it renders (some
    of these templates expand to an intro line plus a list of glosses).
    The linked target word (first <i> element) and its romanization
    (<span class="mention-tr">) become an AltForm stored in each sense's
    ``alt_of`` or ``form_of`` list, and the matching "alt-of"/"form-of"
    tag is added. All created senses are appended to word_entry.senses.
    """
    form = AltForm(word="")
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    senses = []
    if expanded_node.contain_node(NodeKind.LIST):
        # Template expands to intro text followed by list(s) of glosses.
        first_list_idx = len(expanded_node.children)
        first_gloss = ""
        for index, node in enumerate(expanded_node.children):
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                if index < first_list_idx:
                    # First list seen: everything before it is the shared
                    # intro gloss, recorded once on first_sense.
                    first_list_idx = index
                    first_gloss = clean_node(
                        wxr, first_sense, expanded_node.children[:index]
                    )
                    if first_gloss != "":
                        first_sense.glosses.append(first_gloss)
                        senses.append(first_sense)
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    # Each list item becomes its own sense, prefixed with
                    # the shared intro gloss when present.
                    sense = Sense()
                    if first_gloss != "":
                        sense.glosses.append(first_gloss)
                    gloss = clean_node(wxr, sense, list_item.children)
                    if gloss != "":
                        sense.glosses.append(gloss)
                        senses.append(sense)
    else:
        # Simple expansion: the whole output is one gloss on first_sense.
        gloss = clean_node(wxr, first_sense, expanded_node)
        if gloss != "":
            first_sense.glosses.append(gloss)
            senses.append(first_sense)

    # The linked target word is the first italicized element.
    for i_tag in expanded_node.find_html_recursively("i"):
        form.word = clean_node(wxr, None, i_tag)
        break
    # Its romanization carries the "mention-tr" CSS class.
    for span_tag in expanded_node.find_html_recursively("span"):
        if "mention-tr" in span_tag.attrs.get("class", ""):
            form.roman = clean_node(wxr, None, span_tag)
            break
    is_alt_of = (
        t_node.template_name.startswith(("alternative ", "alternate "))
        or t_node.template_name in ALT_OF_TEMPLATES
    )
    if form.word != "":
        for sense in senses:
            if is_alt_of:
                sense.alt_of.append(form)
            else:
                sense.form_of.append(form)
            if is_alt_of and "alt-of" not in sense.tags:
                sense.tags.append("alt-of")
            if not is_alt_of and "form-of" not in sense.tags:
                sense.tags.append("form-of")
    word_entry.senses.extend(senses)

276 

277 

def extract_usage_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Copy each non-empty usage-note list item into word_entry.notes."""
    cleaned = (
        clean_node(wxr, None, item.children)
        for note_list in level_node.find_child(NodeKind.LIST)
        for item in note_list.find_child(NodeKind.LIST_ITEM)
    )
    word_entry.notes.extend(text for text in cleaned if text != "")

288 

289 

def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Extract classifiers from a Chinese inline {{zh-mw}} template.

    Copied from the zh edition code. The expanded template lists
    classifier words in Han-script spans; alternates joined by "/" are
    grouped so a following <span title="..."> label (e.g. a dialect note)
    applies to the whole group.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Pending group of classifiers not yet flushed to the sense; a "/"
    # separator keeps the next word in the same group.
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                # Hant/Hans spans mark the script variant; plain "Hani"
                # gets no script tag.
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                # A new word not preceded by "/" starts a new group:
                # flush the previous group first.
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # Tooltip text labels the current (still pending) group.
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    # Flush the final group.
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)

324 

325 

def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms, romanizations and tags from a generic headword line.

    Expands the template and walks the direct HTML children of the
    <span class="headword-line"> wrapper, dispatching on tag/class:
    canonical headword (<strong class="headword">), romanization spans,
    gender abbreviations, inline labels, historical kana (<sup>, Japanese
    only), italic qualifier text, and bold alternative forms. Collected
    forms are appended to word_entry.forms.
    """
    forms = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        # Raw tags gathered from <i> elements; they qualify the <b> form
        # that follows them, or fall through to word_entry at the end.
        i_tags = []
        for html_node in main_span_tag.find_child(NodeKind.HTML):
            class_names = html_node.attrs.get("class", "").split()
            if html_node.tag == "strong" and "headword" in class_names:
                ruby, no_ruby = extract_ruby(wxr, html_node)
                strong_str = clean_node(wxr, None, no_ruby)
                # Skip when it just repeats the page title with no ruby.
                if strong_str not in ["", wxr.wtp.title] or len(ruby) > 0:
                    forms.append(
                        Form(form=strong_str, tags=["canonical"], ruby=ruby)
                    )
            elif html_node.tag == "span":
                if "headword-tr" in class_names or "tr" in class_names:
                    roman = clean_node(wxr, None, html_node)
                    # Attach the romanization to the preceding plain form;
                    # otherwise store it as its own "romanization" form.
                    if (
                        len(forms) > 0
                        and "canonical" not in forms[-1].tags
                        and "romanization" not in forms[-1].tags
                    ):
                        forms[-1].roman = roman
                    elif roman != "":
                        forms.append(Form(form=roman, tags=["romanization"]))
                elif "gender" in class_names:
                    for abbr_tag in html_node.find_html("abbr"):
                        gender_tag = clean_node(wxr, None, abbr_tag)
                        # Same attachment rule as romanization above.
                        if (
                            len(forms) > 0
                            and "canonical" not in forms[-1].tags
                            and "romanization" not in forms[-1].tags
                        ):
                            forms[-1].raw_tags.append(gender_tag)
                        else:
                            word_entry.raw_tags.append(gender_tag)
                elif "ib-content" in class_names:
                    # Inline label applies to the whole entry.
                    raw_tag = clean_node(wxr, None, html_node)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
            elif html_node.tag == "sup" and word_entry.lang_code == "ja":
                # Japanese historical kana spelling.
                forms.append(extract_historical_kana(wxr, html_node))
            elif html_node.tag == "i":
                # A new italic run: previous unconsumed i_tags belong to
                # the entry itself.
                if len(i_tags) > 0:
                    word_entry.raw_tags.extend(i_tags)
                    i_tags.clear()
                for i_child in html_node.children:
                    # "^†" marks obsolete spellings; strip the marker.
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":
                        i_tags.append(raw_tag)
            elif html_node.tag == "b":
                ruby, no_ruby = extract_ruby(wxr, html_node)
                # Bold text may hold several comma-separated forms.
                for form_str in filter(
                    None,
                    map(str.strip, clean_node(wxr, None, no_ruby).split(",")),
                ):
                    form = Form(form=form_str, ruby=ruby)
                    # "หรือ" ("or") means this form shares the previous
                    # form's tags instead of taking the i_tags literally.
                    if i_tags == ["หรือ"]:
                        if len(forms) > 0:
                            form.raw_tags.extend(forms[-1].raw_tags)
                    else:
                        form.raw_tags.extend(i_tags)
                    forms.append(form)
                i_tags.clear()

        # Trailing italic tags with no following <b> qualify the entry.
        if len(i_tags) > 0:
            word_entry.raw_tags.extend(i_tags)
    for form in forms:
        translate_raw_tags(form)
    word_entry.forms.extend(forms)
    clean_node(wxr, word_entry, expanded_node)
    translate_raw_tags(word_entry)

408 

409 

def extract_historical_kana(
    wxr: WiktextractContext, sup_node: HTMLNode
) -> Form:
    """Build an "archaic" Form from a historical-kana <sup> element.

    The kana spelling sits in a <strong> tag and its romanization in a
    <span class="tr"> tag; either may be absent.
    """
    archaic_form = Form(form="", tags=["archaic"])
    for tr_span in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        archaic_form.roman = clean_node(wxr, None, tr_span)
    for kana_node in sup_node.find_html("strong"):
        archaic_form.form = clean_node(wxr, None, kana_node)
    return archaic_form