Coverage for src/wiktextract/extractor/th/page.py: 45%

180 statements

coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

import re
import string
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .alt_form import extract_alt_form_section, extract_romanization_section
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section, extract_ja_kanjitab_template
from .linkage import extract_linkage_section
from .models import Form, Linkage, Sense, WordEntry
from .pos import (
    extract_note_section,
    extract_pos_section,
    extract_usage_note_section,
)
from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
from .sound import extract_sound_section
from .tags import translate_raw_tags
from .translation import extract_translation_section

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    title_text = clean_node(wxr, None, level_node.largs)
    title_text = title_text.rstrip(string.digits + string.whitespace)
    wxr.wtp.start_subsection(title_text)
    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if len(page_data[-1].senses) == 0 and title_text in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
                LINKAGE_SECTIONS[title_text],
            )
        elif (
            len(page_data[-1].senses) == 0
            and title_text == "การถอดเป็นอักษรโรมัน"  # romanization
        ):
            page_data.pop()
            extract_romanization_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
    elif title_text == "รากศัพท์":  # etymology
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        for t_node in level_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name in ["ja-see", "ja-see-kango"]:
                base_data = base_data.model_copy(deep=True)
                extract_ja_see_template(wxr, base_data, t_node)
                if len(base_data.redirects) > 0:  # coverage: condition always true
                    page_data.append(base_data)
        extract_etymology_section(wxr, base_data, level_node)
    elif title_text in TRANSLATION_SECTIONS:
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "คำสืบทอด":  # descendants
        extract_descendant_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง", "ออกเสียง")):  # pronunciation
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "รูปแบบอื่น":  # alternative forms; coverage: condition always true
        extract_alt_form_section(
            wxr,
            page_data[-1]
            if len(page_data) > 0
            and page_data[-1].lang_code == base_data.lang_code
            and page_data[-1].pos == base_data.pos
            else base_data,
            level_node,
        )
    elif title_text == "การใช้":  # usage
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "หมายเหตุการใช้":  # usage notes
        extract_usage_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text not in [
        "ดูเพิ่ม",  # see also
        "อ้างอิง",  # references
        "อ่านเพิ่ม",  # further reading
        "อ่านเพิ่มเติม",  # further reading
        "รากอักขระ",  # glyph origin
        "การผันรูป",  # conjugation
        "การผัน",  # conjugation
        "คำกริยาในรูปต่าง ๆ",  # verb forms
        "การอ่าน",  # Japanese readings
        "การผันคำกริยา",  # verb conjugation
        "การผันคำ",  # inflection
        "การกลายรูป",  # conjugation
        "การผันคำนาม",  # noun inflection
    ]:
        wxr.wtp.debug(f"Unknown title: {title_text}", sortid="th/page/106")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_category_templates(
        wxr, page_data if len(page_data) else [base_data], level_node
    )
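
# Illustration (not part of the module): section dispatch happens on the
# cleaned Thai title after trailing digits are stripped, so numbered headings
# such as "รากศัพท์ 1" / "รากศัพท์ 2" (etymology 1 / 2) reach the same
# etymology branch:
#
#     import string
#     "รากศัพท์ 2".rstrip(string.digits + string.whitespace)  # -> "รากศัพท์"
#
# Most branches attach data to page_data[-1] when a POS entry already exists,
# and fall back to base_data (the per-language defaults) otherwise.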


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    # page layout:
    # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน

    # skip translation subpages ("คำแปลภาษาอื่น" = translations into other languages)
    if page_title.endswith("/คำแปลภาษาอื่น"):  # coverage: condition never true
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_name = lang_name.removeprefix("ภาษา")  # drop the "language" prefix
        lang_code = name_to_code(lang_name, "th")
        if lang_code == "":  # coverage: condition never true
            lang_code = "unknown"
        if lang_name == "":  # coverage: condition never true
            lang_name = "unknown"
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):  # coverage: loop never entered
            if t_node.template_name == "zh-forms":
                extract_zh_forms(wxr, base_data, t_node)
            elif t_node.template_name == "zh-see":
                base_data.redirects.append(
                    clean_node(wxr, None, t_node.template_parameters.get(1, ""))
                )
                clean_node(wxr, base_data, t_node)
            elif t_node.template_name in ["ja-see", "ja-see-kango"]:
                extract_ja_see_template(wxr, base_data, t_node)
            elif (
                t_node.template_name.endswith("-kanjitab")
                or t_node.template_name == "ja-kt"
            ):
                extract_ja_kanjitab_template(wxr, t_node, base_data)

        if len(base_data.redirects) > 0:  # coverage: condition never true
            page_data.append(base_data)
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
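
# Minimal usage sketch (illustrative; "wxr" is an initialized
# WiktextractContext supplied by the wiktextract framework, and the page
# title and wikitext are hypothetical):
#
#     entries = parse_page(wxr, "น้ำ", page_wikitext)
#     for entry in entries:
#         print(entry["lang_code"], entry["pos"], entry["word"])
#
# Entries that end up with no glosses get a single "no-gloss" sense, and
# model_dump(exclude_defaults=True) omits default-valued fields.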


CATEGORY_TEMPLATES = frozenset(
    [
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
        "top",
        "catlangcode",
        "topic",
    ]
)


def extract_category_templates(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    categories = {}
    for node in level_node.find_child(NodeKind.TEMPLATE):
        if node.template_name.lower() in CATEGORY_TEMPLATES:  # coverage: condition never true
            clean_node(wxr, categories, node)
    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            data.categories.extend(categories.get("categories", []))
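
# Note: clean_node() is called here for its side effect of collecting the
# category links emitted by the expanded template into the "categories" dict,
# which is why the last line reads categories.get("categories", []). The
# collected names are copied to every entry that shares the last entry's
# lang_code.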


def extract_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_header_tags = []
            header_has_span = False
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header, row_header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not header_has_span:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, row_header, row_header_tags
                    )
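
# {{zh-forms}} expands to an HTML table of Chinese forms. Each row is walked
# cell by cell: a header cell yields the row label and its raw tags, and data
# cells yield the forms themselves. If the header cell already contains
# <span> form elements (header_has_span), the row's data cells are skipped to
# avoid extracting the same forms twice.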


def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    row_header = ""
    row_header_tags = []
    header_has_span = False
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
        header_has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    for raw_tag in re.split(r"/|และ", row_header):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span
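
# A row label can name several variants at once; it is split on "/" and on
# "และ" (Thai for "and") into raw tags. A small sketch with a hypothetical
# "traditional/simplified" label:
#
#     import re
#     [t.strip() for t in re.split(r"/|และ", "ตัวเต็ม/ตัวย่อ") if t.strip()]
#     # -> ["ตัวเต็ม", "ตัวย่อ"]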


def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                if row_header != "anagram":
                    form.raw_tags = row_header_tags
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "anagram":
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)
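
# Data-cell spans with lang="zh-Hant" or lang="zh-Hans" become Form objects
# tagged Traditional-Chinese / Simplified-Chinese; small "font-size:80%"
# spans are treated as raw tags for the forms already collected from the
# cell, and nested "white-space:nowrap;" spans are handled by recursing into
# this same function. Rows labeled "anagram" are stored as Linkage items in
# base_data.anagrams rather than as forms.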


def extract_ja_see_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    for key, value in t_node.template_parameters.items():
        if isinstance(key, int):  # coverage: condition always true
            base_data.redirects.append(clean_node(wxr, None, value))
    clean_node(wxr, base_data, t_node)
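
# {{ja-see}} / {{ja-see-kango}} point a Japanese spelling at its lemma
# page(s) through positional parameters; each positional value becomes a
# redirect on the entry, e.g. a hypothetical {{ja-see|可愛い}} would append
# "可愛い" to base_data.redirects. The trailing clean_node() call collects
# any categories the expanded template emits into base_data.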