Coverage for src/wiktextract/extractor/th/page.py: 45%

178 statements  

coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

import re
import string
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .alt_form import extract_alt_form_section, extract_romanization_section
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section
from .linkage import extract_linkage_section
from .models import Form, Linkage, Sense, WordEntry
from .pos import (
    extract_note_section,
    extract_pos_section,
    extract_usage_note_section,
)
from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
from .sound import extract_sound_section
from .tags import translate_raw_tags
from .translation import extract_translation_section
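
# parse_section() dispatches one subsection to the matching extractor based
# on its Thai heading: part-of-speech headings listed in POS_DATA,
# "รากศัพท์" (etymology), "คำสืบทอด" (descendants), "รูปแบบอื่น" (alternative
# forms), plus the translation, linkage, pronunciation, and usage-note
# headings. It then recurses into child sections and finally collects any
# category templates placed at this level.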

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    title_text = clean_node(wxr, None, level_node.largs)
    title_text = title_text.rstrip(string.digits + string.whitespace)
    wxr.wtp.start_subsection(title_text)
    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if len(page_data[-1].senses) == 0 and title_text in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
                LINKAGE_SECTIONS[title_text],
            )
        elif (
            len(page_data[-1].senses) == 0 and title_text == "การถอดเป็นอักษรโรมัน"
        ):
            page_data.pop()
            extract_romanization_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
    elif title_text == "รากศัพท์":
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        for t_node in level_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name in ["ja-see", "ja-see-kango"]:
                base_data = base_data.model_copy(deep=True)
                extract_ja_see_template(wxr, base_data, t_node)
                if len(base_data.redirects) > 0:  # coverage: condition always true in tests
                    page_data.append(base_data)
        extract_etymology_section(wxr, base_data, level_node)
    elif title_text in TRANSLATION_SECTIONS:
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "คำสืบทอด":
        extract_descendant_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง", "ออกเสียง")):
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "รูปแบบอื่น":  # coverage: condition always true in tests
        extract_alt_form_section(
            wxr,
            page_data[-1]
            if len(page_data) > 0
            and page_data[-1].lang_code == base_data.lang_code
            and page_data[-1].pos == base_data.pos
            else base_data,
            level_node,
        )
    elif title_text == "การใช้":
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "หมายเหตุการใช้":
        extract_usage_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text not in [
        "ดูเพิ่ม",  # see more
        "อ้างอิง",  # references
        "อ่านเพิ่ม",  # read more
        "อ่านเพิ่มเติม",  # read more
        "รากอักขระ",  # glyph origin
        "การผันรูป",  # conjugation
        "การผัน",  # conjugation
        "คำกริยาในรูปต่าง ๆ",  # verb forms
        "การอ่าน",  # Japanese readings
        "การผันคำกริยา",  # conjugation
        "การผันคำ",  # inflection
        "การกลายรูป",  # conjugation
        "การผันคำนาม",  # inflection
    ]:
        wxr.wtp.debug(f"Unknown title: {title_text}", sortid="th/page/106")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_category_templates(
        wxr, page_data if len(page_data) else [base_data], level_node
    )
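
# parse_page() is the entry point for one Thai Wiktionary page: every
# level-2 heading is a language section ("ภาษาไทย" = "Thai language"), and
# each one seeds a fresh WordEntry (base_data) that the section extractors
# fill in or copy.
#
# Usage sketch (hypothetical setup; the Wtp and WiktionaryConfig arguments
# are illustrative, not taken from this file):
#
#     from wikitextprocessor import Wtp
#     from wiktextract.config import WiktionaryConfig
#     from wiktextract.wxr_context import WiktextractContext
#
#     wxr = WiktextractContext(Wtp(lang_code="th"), WiktionaryConfig())
#     entries = parse_page(wxr, "น้ำ", page_text)  # a list of plain dicts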

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    # page layout
    # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน

    # skip translation pages
    if page_title.endswith("/คำแปลภาษาอื่น"):  # coverage: never true in tests
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_name = lang_name.removeprefix("ภาษา")
        lang_code = name_to_code(lang_name, "th")
        if lang_code == "":  # coverage: never true in tests
            lang_code = "unknown"
        if lang_name == "":  # coverage: never true in tests
            lang_name = "unknown"
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):  # coverage: loop body never ran in tests
            if t_node.template_name == "zh-forms":
                extract_zh_forms(wxr, base_data, t_node)
            elif t_node.template_name == "zh-see":
                base_data.redirects.append(
                    clean_node(wxr, None, t_node.template_parameters.get(1, ""))
                )
                clean_node(wxr, base_data, t_node)
            elif t_node.template_name in ["ja-see", "ja-see-kango"]:
                extract_ja_see_template(wxr, base_data, t_node)
        if len(base_data.redirects) > 0:  # coverage: never true in tests
            page_data.append(base_data)
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
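
# Names of templates that only emit categories. extract_category_templates()
# below compares the lowercased template name against this set, so every
# entry must be lowercase.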

CATEGORY_TEMPLATES = frozenset(
    [
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
        "top",
        "catlangcode",
        "topic",
    ]
)
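
# clean_node() collects the category links a template emits into the dict
# passed as its second argument (under the key "categories");
# extract_category_templates() uses that to copy section-level categories
# onto every entry that shares the language code of the most recent entry.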

def extract_category_templates(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    categories = {}
    for node in level_node.find_child(NodeKind.TEMPLATE):
        if node.template_name.lower() in CATEGORY_TEMPLATES:  # coverage: never true in tests
            clean_node(wxr, categories, node)
    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            data.categories.extend(categories.get("categories", []))
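
# extract_zh_forms() handles the {{zh-forms}} box on Chinese entries: it
# reads the literal meaning from the template's "lit" parameter, expands the
# template, and walks the rendered table row by row, routing header cells
# (labels) and data cells (forms) to the two helpers below.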

def extract_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_header_tags = []
            header_has_span = False
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header, row_header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not header_has_span:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, row_header, row_header_tags
                    )
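
# extract_zh_forms_header_cell() takes the plain text before the first
# <span> in a header cell as the row header and splits it into raw tags on
# "/" or "และ" ("and"); e.g. re.split(r"/|และ", "ตัวเต็มและตัวย่อ") yields
# ["ตัวเต็ม", "ตัวย่อ"]. Forms inside zh-Hant/zh-Hans spans go straight to
# base_data.forms, keeping any <sup> span title as an extra raw tag.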

def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    row_header = ""
    row_header_tags = []
    header_has_span = False
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
            header_has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    for raw_tag in re.split(r"/|และ", row_header):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span
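
# extract_zh_forms_data_cell() dispatches on each span's attributes:
# "white-space:nowrap;" wrappers are recursed into, "font-size:80%" spans
# are labels appended to the forms already collected from this cell, and
# zh/zh-Hant/zh-Hans spans are the forms themselves (tagged
# Traditional-Chinese or Simplified-Chinese). A row headed "anagram" is
# stored as Linkage objects in base_data.anagrams rather than as forms.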

def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                if row_header != "anagram":
                    form.raw_tags = row_header_tags
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "anagram":
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)
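
# extract_ja_see_template() turns the positional arguments of {{ja-see}} and
# {{ja-see-kango}} into redirect targets, e.g. a hypothetical {{ja-see|猫}}
# would append "猫" to base_data.redirects; the trailing clean_node() call
# collects the categories the template emits into base_data.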

def extract_ja_see_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    for key, value in t_node.template_parameters.items():
        if isinstance(key, int):  # coverage: condition always true in tests
            base_data.redirects.append(clean_node(wxr, None, value))
    clean_node(wxr, base_data, t_node)