Coverage for src/wiktextract/extractor/th/page.py: 45%

180 statements

coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

import re
import string
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .alt_form import extract_alt_form_section, extract_romanization_section
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section, extract_ja_kanjitab_template
from .linkage import extract_linkage_section
from .models import Form, Linkage, Sense, WordEntry
from .pos import (
    extract_note_section,
    extract_pos_section,
    extract_usage_note_section,
)
from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
from .sound import extract_sound_section
from .tags import translate_raw_tags
from .translation import extract_translation_section

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    title_text = clean_node(wxr, None, level_node.largs)
    title_text = title_text.rstrip(string.digits + string.whitespace)
    wxr.wtp.start_subsection(title_text)
    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if len(page_data[-1].senses) == 0 and title_text in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
                LINKAGE_SECTIONS[title_text],
            )
        elif (
            len(page_data[-1].senses) == 0
            and title_text == "การถอดเป็นอักษรโรมัน"  # romanization
        ):
            page_data.pop()
            extract_romanization_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
    elif title_text == "รากศัพท์":  # etymology
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        for t_node in level_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name in ["ja-see", "ja-see-kango"]:
                base_data = base_data.model_copy(deep=True)
                extract_ja_see_template(wxr, base_data, t_node)
                if len(base_data.redirects) > 0:  # coverage: condition always true
                    page_data.append(base_data)
        extract_etymology_section(wxr, base_data, level_node)
    elif title_text in TRANSLATION_SECTIONS:
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "คำสืบทอด":  # descendants
        extract_descendant_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง", "ออกเสียง")):  # pronunciation
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "รูปแบบอื่น":  # alternative forms; coverage: condition always true
        extract_alt_form_section(
            wxr,
            page_data[-1]
            if len(page_data) > 0
            and page_data[-1].lang_code == base_data.lang_code
            and page_data[-1].pos == base_data.pos
            else base_data,
            level_node,
        )
    elif title_text == "การใช้":  # usage
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "หมายเหตุการใช้":  # usage notes
        extract_usage_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text not in [
        "ดูเพิ่ม",  # see also
        "อ้างอิง",  # references
        "อ่านเพิ่ม",  # further reading
        "อ่านเพิ่มเติม",  # further reading
        "รากอักขระ",  # glyph origin
        "การผันรูป",  # conjugation
        "การผัน",  # conjugation
        "คำกริยาในรูปต่าง ๆ",  # verb forms
        "การอ่าน",  # Japanese readings
        "การผันคำกริยา",  # verb conjugation
        "การผันคำ",  # inflection
        "การกลายรูป",  # conjugation
        "การผันคำนาม",  # noun inflection
    ]:
        wxr.wtp.debug(f"Unknown title: {title_text}", sortid="th/page/106")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_category_templates(
        wxr, page_data if len(page_data) else [base_data], level_node
    )
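
# Illustration (not part of the module): section dispatch happens on the
# cleaned Thai title after trailing digits are stripped, so numbered headings
# such as "รากศัพท์ 1" / "รากศัพท์ 2" (etymology 1 / 2) reach the same
# etymology branch:
#
#     import string
#     "รากศัพท์ 2".rstrip(string.digits + string.whitespace)  # -> "รากศัพท์"
#
# Most branches attach data to page_data[-1] when a POS entry already exists,
# and fall back to base_data (the per-language defaults) otherwise.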


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    # page layout:
    # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน

    # skip translation subpages ("คำแปลภาษาอื่น" = translations into other languages)
    if page_title.endswith("/คำแปลภาษาอื่น"):  # coverage: condition never true
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_name = lang_name.removeprefix("ภาษา")  # drop the "language" prefix
        lang_code = name_to_code(lang_name, "th")
        if lang_code == "":  # coverage: condition never true
            lang_code = "unknown"
        if lang_name == "":  # coverage: condition never true
            lang_name = "unknown"
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):  # coverage: loop never entered
            if t_node.template_name == "zh-forms":
                extract_zh_forms(wxr, base_data, t_node)
            elif t_node.template_name == "zh-see":
                base_data.redirects.append(
                    clean_node(wxr, None, t_node.template_parameters.get(1, ""))
                )
                clean_node(wxr, base_data, t_node)
            elif t_node.template_name in ["ja-see", "ja-see-kango"]:
                extract_ja_see_template(wxr, base_data, t_node)
            elif (
                t_node.template_name.endswith("-kanjitab")
                or t_node.template_name == "ja-kt"
            ):
                extract_ja_kanjitab_template(wxr, t_node, base_data)

        if len(base_data.redirects) > 0:  # coverage: condition never true
            page_data.append(base_data)
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
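
# Minimal usage sketch (illustrative; "wxr" is an initialized
# WiktextractContext supplied by the wiktextract framework, and the page
# title and wikitext are hypothetical):
#
#     entries = parse_page(wxr, "น้ำ", page_wikitext)
#     for entry in entries:
#         print(entry["lang_code"], entry["pos"], entry["word"])
#
# Entries that end up with no glosses get a single "no-gloss" sense, and
# model_dump(exclude_defaults=True) omits default-valued fields.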


CATEGORY_TEMPLATES = frozenset(
    [
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
        "top",
        "catlangcode",
        "topic",
    ]
)


def extract_category_templates(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    categories = {}
    for node in level_node.find_child(NodeKind.TEMPLATE):
        if node.template_name.lower() in CATEGORY_TEMPLATES:  # coverage: condition never true
            clean_node(wxr, categories, node)
    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            data.categories.extend(categories.get("categories", []))
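
# Note: clean_node() is called here for its side effect of collecting the
# category links emitted by the expanded template into the "categories" dict,
# which is why the last line reads categories.get("categories", []). The
# collected names are copied to every entry that shares the last entry's
# lang_code.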


def extract_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_header_tags = []
            header_has_span = False
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header, row_header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not header_has_span:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, row_header, row_header_tags
                    )
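
# {{zh-forms}} expands to an HTML table of Chinese forms. Each row is walked
# cell by cell: a header cell yields the row label and its raw tags, and data
# cells yield the forms themselves. If the header cell already contains
# <span> form elements (header_has_span), the row's data cells are skipped to
# avoid extracting the same forms twice.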


def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    row_header = ""
    row_header_tags = []
    header_has_span = False
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
        header_has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    for raw_tag in re.split(r"/|และ", row_header):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span
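
# A row label can name several variants at once; it is split on "/" and on
# "และ" (Thai for "and") into raw tags. A small sketch with a hypothetical
# "traditional/simplified" label:
#
#     import re
#     [t.strip() for t in re.split(r"/|และ", "ตัวเต็ม/ตัวย่อ") if t.strip()]
#     # -> ["ตัวเต็ม", "ตัวย่อ"]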


def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                if row_header != "anagram":
                    form.raw_tags = row_header_tags
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "anagram":
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)
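
# Data-cell spans with lang="zh-Hant" or lang="zh-Hans" become Form objects
# tagged Traditional-Chinese / Simplified-Chinese; small "font-size:80%"
# spans are treated as raw tags for the forms already collected from the
# cell, and nested "white-space:nowrap;" spans are handled by recursing into
# this same function. Rows labeled "anagram" are stored as Linkage items in
# base_data.anagrams rather than as forms.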


def extract_ja_see_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    for key, value in t_node.template_parameters.items():
        if isinstance(key, int):  # coverage: condition always true
            base_data.redirects.append(clean_node(wxr, None, value))
    clean_node(wxr, base_data, t_node)
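
# {{ja-see}} / {{ja-see-kango}} point a Japanese spelling at its lemma
# page(s) through positional parameters; each positional value becomes a
# redirect on the entry, e.g. a hypothetical {{ja-see|可愛い}} would append
# "可愛い" to base_data.redirects. The trailing clean_node() call collects
# any categories the expanded template emits into base_data.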