Coverage for src/wiktextract/extractor/th/page.py: 45%
178 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2import string
3from typing import Any
5from mediawiki_langcodes import name_to_code
6from wikitextprocessor.parser import (
7 LEVEL_KIND_FLAGS,
8 HTMLNode,
9 LevelNode,
10 NodeKind,
11 TemplateNode,
12 WikiNode,
13)
15from ...page import clean_node
16from ...wxr_context import WiktextractContext
17from .alt_form import extract_alt_form_section, extract_romanization_section
18from .descendant import extract_descendant_section
19from .etymology import extract_etymology_section
20from .linkage import extract_linkage_section
21from .models import Form, Linkage, Sense, WordEntry
22from .pos import (
23 extract_note_section,
24 extract_pos_section,
25 extract_usage_note_section,
26)
27from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
28from .sound import extract_sound_section
29from .tags import translate_raw_tags
30from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section (level node) of a Thai Wiktionary page to the
    matching extractor, then recurse into child sections.

    Args:
        wxr: Extraction context (parser state, logging).
        page_data: Word entries created so far for this page; POS sections
            append to it, some branches pop a just-added empty entry.
        base_data: Language-level data new entries are cloned from.
        level_node: The section heading node being processed.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    # Headings may carry a numeric suffix ("รากศัพท์ 2"); strip it so the
    # title matches the lookup tables.
    title_text = title_text.rstrip(string.digits + string.whitespace)
    wxr.wtp.start_subsection(title_text)
    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        # Some titles are ambiguous between a POS and a linkage section; if
        # the POS extractor produced no senses, treat the section as linkage
        # instead: discard the empty entry and attach linkage data to the
        # previous entry (or base_data when none exists yet).
        if len(page_data[-1].senses) == 0 and title_text in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
                LINKAGE_SECTIONS[title_text],
            )
        elif (
            # "การถอดเป็นอักษรโรมัน" = romanization; same empty-senses
            # fallback as above.
            len(page_data[-1].senses) == 0 and title_text == "การถอดเป็นอักษรโรมัน"
        ):
            page_data.pop()
            extract_romanization_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
    elif title_text == "รากศัพท์":  # etymology
        # When the etymology section has its own child sections, the POS
        # data lives below it: work on a deep copy so sibling etymology
        # sections don't share state.
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        for t_node in level_node.find_child(NodeKind.TEMPLATE):
            # Japanese soft-redirect templates create redirect-only entries.
            if t_node.template_name in ["ja-see", "ja-see-kango"]:
                base_data = base_data.model_copy(deep=True)
                extract_ja_see_template(wxr, base_data, t_node)
                if len(base_data.redirects) > 0:
                    page_data.append(base_data)
        extract_etymology_section(wxr, base_data, level_node)
    elif title_text in TRANSLATION_SECTIONS:
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "คำสืบทอด":  # descendants
        extract_descendant_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง", "ออกเสียง")):
        # pronunciation; stored on base_data so it applies to all POS entries
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "รูปแบบอื่น":  # alternative forms
        extract_alt_form_section(
            wxr,
            # Attach to the last entry only if it belongs to the same
            # language and POS; otherwise the forms go on base_data.
            page_data[-1]
            if len(page_data) > 0
            and page_data[-1].lang_code == base_data.lang_code
            and page_data[-1].pos == base_data.pos
            else base_data,
            level_node,
        )
    elif title_text == "การใช้":  # usage
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "หมายเหตุการใช้":  # usage notes
        extract_usage_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text not in [
        "ดูเพิ่ม",  # see more
        "อ้างอิง",  # references
        "อ่านเพิ่ม",  # read more
        "อ่านเพิ่มเติม",  # read more
        "รากอักขระ",  # glyph origin
        "การผันรูป",  # conjugation
        "การผัน",  # conjugation
        "คำกริยาในรูปต่าง ๆ",  # verb forms
        "การอ่าน",  # Japanese readings
        "การผันคำกริยา",  # conjugation
        "การผันคำ",  # inflection
        "การกลายรูป",  # conjugation
        "การผันคำนาม",  # inflection
    ]:
        wxr.wtp.debug(f"Unknown title: {title_text}", sortid="th/page/106")

    # Recurse into child sections before harvesting category templates.
    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_category_templates(
        wxr, page_data if len(page_data) else [base_data], level_node
    )
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Thai Wiktionary page and return its word entries.

    Page layout guide:
    https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน

    Args:
        wxr: Extraction context.
        page_title: Title of the wiki page.
        page_text: Raw wikitext of the page.

    Returns:
        One dict per extracted word entry (default-valued fields omitted).
    """
    # Translation subpages are handled from their parent entry; skip them.
    if page_title.endswith("/คำแปลภาษาอื่น"):
        return []
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for lang_level in root.find_child(NodeKind.LEVEL2):
        # Level-2 headings look like "ภาษาไทย" ("Thai language"); drop the
        # "ภาษา" prefix to get the bare language name.
        lang_name = clean_node(wxr, None, lang_level.largs).removeprefix("ภาษา")
        lang_code = name_to_code(lang_name, "th") or "unknown"
        if not lang_name:
            lang_name = "unknown"
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        # Templates placed directly under the language heading.
        for template in lang_level.find_child(NodeKind.TEMPLATE):
            t_name = template.template_name
            if t_name == "zh-forms":
                extract_zh_forms(wxr, base_data, template)
            elif t_name == "zh-see":
                base_data.redirects.append(
                    clean_node(
                        wxr, None, template.template_parameters.get(1, "")
                    )
                )
                clean_node(wxr, base_data, template)
            elif t_name in ("ja-see", "ja-see-kango"):
                extract_ja_see_template(wxr, base_data, template)
        # Soft-redirect pages yield a redirect-only entry.
        if len(base_data.redirects) > 0:
            page_data.append(base_data)
        for sub_level in lang_level.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, sub_level)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
179CATEGORY_TEMPLATES = frozenset(
180 [
181 "zh-cat",
182 "cln",
183 "catlangname",
184 "c",
185 "topics",
186 "top",
187 "catlangcode",
188 "topic",
189 ]
190)
def extract_category_templates(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    """Collect categories from bare category templates under *level_node*
    and copy them onto every entry that shares the last entry's language.

    Args:
        wxr: Extraction context.
        page_data: Entries to receive the categories (caller guarantees at
            least one element).
        level_node: Section whose direct template children are scanned.
    """
    cat_data = {}
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.lower() in CATEGORY_TEMPLATES:
            # clean_node() records the template's categories into cat_data.
            clean_node(wxr, cat_data, t_node)
    new_categories = cat_data.get("categories", [])
    for entry in page_data:
        if entry.lang_code == page_data[-1].lang_code:
            entry.categories.extend(new_categories)
def extract_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Chinese word forms from a "zh-forms" template.

    Expands the template and walks the rendered table row by row; header
    cells supply the row label and tags that the data cells in the same row
    inherit.

    Args:
        wxr: Extraction context.
        base_data: Entry that receives literal meaning and forms.
        t_node: The "zh-forms" template node.
    """
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_node in expanded.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            header_text = ""
            header_tags: list[str] = []
            # When the header cell itself contains form spans, the data
            # cells of that row are duplicates and are skipped.
            skip_data_cells = False
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text, header_tags, skip_data_cells = (
                        extract_zh_forms_header_cell(wxr, base_data, cell_node)
                    )
                elif not skip_data_cells:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell_node, header_text, header_tags
                    )
def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Process one header cell of an expanded "zh-forms" table.

    The text before the first <span> is the row label; it is split into raw
    tags. Any zh-Hant/zh-Hans spans found inside the header itself are turned
    into forms on *base_data* directly.

    Returns:
        (row label, raw tags parsed from the label, whether the header
        contained a <span> — in which case the caller skips the row's data
        cells).
    """
    row_header = ""
    row_header_tags: list[str] = []
    header_has_span = False
    # Find the index of the first <span> child; everything before it is the
    # plain-text row label.
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
            header_has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    # Labels can list several tags separated by "/" or "และ" ("and").
    for raw_tag in re.split(r"/|และ", row_header):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            # A <sup><span title="..."> holds an extra annotation for the
            # form; keep its title and exclude the <sup> from the form text.
            if isinstance(node, HTMLNode) and node.tag == "sup":
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    # NOTE(review): passing row_header_tags relies on the
                    # pydantic model copying the list on init; the later
                    # append would otherwise mutate the shared list — confirm
                    # against the Form model config.
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span
def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Process one data cell of an expanded "zh-forms" table.

    Collects word forms from language-tagged spans, attaches small-print
    annotations to the forms gathered so far, and stores the result either
    as anagram linkages or as forms on *base_data*.

    Args:
        wxr: Extraction context.
        base_data: Entry receiving the forms/anagrams.
        cell: The table cell (or a nowrap wrapper span on recursion).
        row_header: Row label from the header cell (e.g. "anagram").
        row_header_tags: Raw tags derived from the row label; not mutated.
    """
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            # Layout wrapper: recurse into it with the same row context.
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            # Small-print annotation; applies to every form collected so
            # far in this cell.
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                if row_header != "anagram":
                    # Copy instead of aliasing the caller's list: the
                    # "font-size:80%" branch appends to form.raw_tags, which
                    # would otherwise mutate row_header_tags and every
                    # sibling form sharing it (unless pydantic's
                    # validate-assignment happens to copy the list).
                    form.raw_tags = list(row_header_tags)
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "anagram":
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)
def extract_ja_see_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Record redirect targets from a "ja-see"/"ja-see-kango" template.

    Every positional template argument names a page this entry redirects
    to; the final clean_node() call records the template's categories on
    *base_data*.
    """
    for arg_name, arg_value in t_node.template_parameters.items():
        if not isinstance(arg_name, int):
            continue  # named arguments are not redirect targets
        base_data.redirects.append(clean_node(wxr, None, arg_value))
    clean_node(wxr, base_data, t_node)