Coverage for src/wiktextract/extractor/th/page.py: 45% (180 statements)
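
# Page-level extractor for the Thai edition of Wiktionary
# (th.wiktionary.org). It walks the parsed wikitext tree of a page,
# dispatches each section to an extractor based on its Thai heading, and
# returns WordEntry models serialized as plain dicts.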
import re
import string
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .alt_form import extract_alt_form_section, extract_romanization_section
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section, extract_ja_kanjitab_template
from .linkage import extract_linkage_section
from .models import Form, Linkage, Sense, WordEntry
from .pos import (
    extract_note_section,
    extract_pos_section,
    extract_usage_note_section,
)
from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
from .sound import extract_sound_section
from .tags import translate_raw_tags
from .translation import extract_translation_section


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
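    """Dispatch one parsed section to the extractor matching its Thai title.

    The title is normalized first (trailing digits and whitespace stripped),
    so e.g. "รากศัพท์ 1" ("etymology 1") is handled like "รากศัพท์". Child
    sections are processed recursively at the end.
    """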
    title_text = clean_node(wxr, None, level_node.largs)
    title_text = title_text.rstrip(string.digits + string.whitespace)
    wxr.wtp.start_subsection(title_text)
    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if len(page_data[-1].senses) == 0 and title_text in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
                LINKAGE_SECTIONS[title_text],
            )
        elif (
            len(page_data[-1].senses) == 0
            and title_text == "การถอดเป็นอักษรโรมัน"  # romanization
        ):
            page_data.pop()
            extract_romanization_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
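    # "รากศัพท์" = etymology. If the section has deeper subsections, copy
    # base_data so each POS entry gets its own etymology; {{ja-see}} and
    # {{ja-see-kango}} templates found here become redirect entries.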
    elif title_text == "รากศัพท์":
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        for t_node in level_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name in ["ja-see", "ja-see-kango"]:
                base_data = base_data.model_copy(deep=True)
                extract_ja_see_template(wxr, base_data, t_node)
                if len(base_data.redirects) > 0:
                    page_data.append(base_data)
        extract_etymology_section(wxr, base_data, level_node)
    elif title_text in TRANSLATION_SECTIONS:
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "คำสืบทอด":  # descendants
        extract_descendant_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง", "ออกเสียง")):
        # pronunciation sections
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "รูปแบบอื่น":  # alternative forms
        extract_alt_form_section(
            wxr,
            page_data[-1]
            if len(page_data) > 0
            and page_data[-1].lang_code == base_data.lang_code
            and page_data[-1].pos == base_data.pos
            else base_data,
            level_node,
        )
    elif title_text == "การใช้":  # usage
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "หมายเหตุการใช้":  # usage notes
        extract_usage_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
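    # Titles listed below are recognized but deliberately not extracted
    # (references, inflection tables, Japanese readings, ...); anything
    # not listed is logged as an unknown title.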
    elif title_text not in [
        "ดูเพิ่ม",  # see also
        "อ้างอิง",  # references
        "อ่านเพิ่ม",  # further reading
        "อ่านเพิ่มเติม",  # further reading
        "รากอักขระ",  # glyph origin
        "การผันรูป",  # conjugation
        "การผัน",  # conjugation
        "คำกริยาในรูปต่าง ๆ",  # verb forms
        "การอ่าน",  # Japanese readings
        "การผันคำกริยา",  # conjugation
        "การผันคำ",  # inflection
        "การกลายรูป",  # conjugation
        "การผันคำนาม",  # noun inflection
    ]:
        wxr.wtp.debug(f"Unknown title: {title_text}", sortid="th/page/106")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)
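
    # Category templates placed directly in this section apply to the
    # extracted entries (or to base_data if nothing has been extracted yet).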
    extract_category_templates(
        wxr, page_data if len(page_data) else [base_data], level_node
    )


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
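    """Parse one Thai Wiktionary page into a list of word-entry dicts.

    Every level-2 section is a language section whose title starts with
    "ภาษา" ("language"), e.g. "ภาษาไทย" for Thai.
    """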
    # page layout:
    # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน

    # skip translation subpages ("/คำแปลภาษาอื่น" = "/translations")
    if page_title.endswith("/คำแปลภาษาอื่น"):
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_name = lang_name.removeprefix("ภาษา")  # "language" prefix
        lang_code = name_to_code(lang_name, "th")
        if lang_code == "":
            lang_code = "unknown"
        if lang_name == "":
            lang_name = "unknown"
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name == "zh-forms":
                extract_zh_forms(wxr, base_data, t_node)
            elif t_node.template_name == "zh-see":
                base_data.redirects.append(
                    clean_node(wxr, None, t_node.template_parameters.get(1, ""))
                )
                clean_node(wxr, base_data, t_node)
            elif t_node.template_name in ["ja-see", "ja-see-kango"]:
                extract_ja_see_template(wxr, base_data, t_node)
            elif (
                t_node.template_name.endswith("-kanjitab")
                or t_node.template_name == "ja-kt"
            ):
                extract_ja_kanjitab_template(wxr, t_node, base_data)

        if len(base_data.redirects) > 0:
            page_data.append(base_data)
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
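
# Minimal driver sketch (hypothetical; `wtp` and `config` would be built with
# the wikitextprocessor/wiktextract machinery that lives outside this module):
#
#     wxr = WiktextractContext(wtp, config)
#     entries = parse_page(wxr, "น้ำ", page_wikitext)
#     for entry in entries:
#         print(entry["lang"], entry["pos"], entry["word"])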

CATEGORY_TEMPLATES = frozenset(
    [
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
        "top",
        "catlangcode",
        "topic",
    ]
)


def extract_category_templates(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
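    """Collect categories from templates such as {{zh-cat}} and {{topics}}
    and copy them to every entry sharing the last entry's language."""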
    categories = {}
    for node in level_node.find_child(NodeKind.TEMPLATE):
        if node.template_name.lower() in CATEGORY_TEMPLATES:
            clean_node(wxr, categories, node)
    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            data.categories.extend(categories.get("categories", []))


def extract_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
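    """Extract word forms from the expanded {{zh-forms}} table.

    Header cells provide row labels that become raw tags; data cells in the
    same row provide the traditional/simplified Chinese forms.
    """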
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_header_tags = []
            header_has_span = False
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header, row_header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not header_has_span:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, row_header, row_header_tags
                    )


def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
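    """Parse a header cell of a {{zh-forms}} row.

    Text before the first <span> is the row label, split on "/" and "และ"
    (Thai for "and") into raw tags; zh-Hant/zh-Hans spans inside the header
    are recorded directly as forms. Returns (row_header, row_header_tags,
    header_has_span).
    """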
    row_header = ""
    row_header_tags = []
    header_has_span = False
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
            header_has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    for raw_tag in re.split(r"/|และ", row_header):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span


def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
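    """Parse a data cell of a {{zh-forms}} row into Form objects.

    "white-space:nowrap" spans are recursed into, small ("font-size:80%")
    spans become raw tags on the forms collected so far, and rows labelled
    "anagram" are stored as anagram linkages instead of forms.
    """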
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                if row_header != "anagram":
                    form.raw_tags = row_header_tags
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "anagram":
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)


def extract_ja_see_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
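    """Add each positional argument of {{ja-see}}/{{ja-see-kango}} as a
    redirect target, then collect any categories the template emits."""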
    for key, value in t_node.template_parameters.items():
        if isinstance(key, int):
            base_data.redirects.append(clean_node(wxr, None, value))
    clean_node(wxr, base_data, t_node)