Coverage for src/wiktextract/extractor/th/page.py: 83% (81 statements)
Generated by coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
import string
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .alt_form import extract_alt_form_section, extract_romanization_section
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section
from .linkage import extract_linkage_section
from .models import Sense, WordEntry
from .pos import (
    extract_note_section,
    extract_pos_section,
    extract_usage_note_section,
)
from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
from .sound import extract_sound_section
from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section of a Thai Wiktionary page to its extractor.

    The section title — with trailing digits and whitespace stripped —
    selects the extractor.  Child level nodes are then parsed
    recursively, and category-only templates directly under this node
    are collected at the end.
    """

    def _target() -> WordEntry:
        # Most extractors attach data to the newest entry; fall back to
        # the language-level base data when no entry exists yet.
        return page_data[-1] if page_data else base_data

    title_text = clean_node(wxr, None, level_node.largs)
    title_text = title_text.rstrip(string.digits + string.whitespace)
    wxr.wtp.start_subsection(title_text)
    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        no_sense = len(page_data[-1].senses) == 0
        if no_sense and title_text in LINKAGE_SECTIONS:
            # A POS-looking header that produced no senses and matches a
            # linkage title is really a linkage list; redo it as such.
            page_data.pop()
            extract_linkage_section(
                wxr, _target(), level_node, LINKAGE_SECTIONS[title_text]
            )
        elif no_sense and title_text == "การถอดเป็นอักษรโรมัน":
            # Same rescue for empty "romanization" sections.
            page_data.pop()
            extract_romanization_section(wxr, _target(), level_node)
    elif title_text == "รากศัพท์":  # etymology
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            # The etymology is shared by nested POS subsections; copy the
            # base data so each subsection starts from its own copy.
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif title_text in TRANSLATION_SECTIONS:
        extract_translation_section(wxr, _target(), level_node)
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, _target(), level_node, LINKAGE_SECTIONS[title_text]
        )
    elif title_text == "คำสืบทอด":  # descendants
        extract_descendant_section(wxr, _target(), level_node)
    elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง", "ออกเสียง")):
        # pronunciation — stored on base_data so later POS sections share it
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "รูปแบบอื่น":  # alternative forms
        # Attach to the last entry only when it belongs to the same
        # language and POS as the current base data.
        matches_last = (
            len(page_data) > 0
            and page_data[-1].lang_code == base_data.lang_code
            and page_data[-1].pos == base_data.pos
        )
        extract_alt_form_section(
            wxr, page_data[-1] if matches_last else base_data, level_node
        )
    elif title_text == "การใช้":  # usage
        extract_note_section(wxr, _target(), level_node)
    elif title_text == "หมายเหตุการใช้":  # usage notes
        extract_usage_note_section(wxr, _target(), level_node)
    elif title_text not in [
        "ดูเพิ่ม",  # see more
        "อ้างอิง",  # references
        "อ่านเพิ่ม",  # read more
        "อ่านเพิ่มเติม",  # read more
        "รากอักขระ",  # glyph origin
        "การผันรูป",  # conjugation
        "การผัน",  # conjugation
        "คำกริยาในรูปต่าง ๆ",  # verb forms
        "การอ่าน",  # Japanese readings
        "การผันคำกริยา",  # conjugation
        "การผันคำ",  # inflection
        "การกลายรูป",  # conjugation
        "การผันคำนาม",  # inflection
    ]:
        wxr.wtp.debug(f"Unknown title: {title_text}", sortid="th/page/106")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)
    extract_category_templates(
        wxr, page_data if len(page_data) else [base_data], level_node
    )
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a Thai Wiktionary page into a list of word-entry dicts.

    Page layout:
    https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน
    """
    # Translation subpages carry no entry data of their own.
    if page_title.endswith("/คำแปลภาษาอื่น"):
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        # Level-2 headings are language names prefixed with "ภาษา".
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_name = lang_name.removeprefix("ภาษา")
        lang_code = name_to_code(lang_name, "th") or "unknown"
        lang_name = lang_name or "unknown"
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for section_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, section_node)

    # Entries that gathered no glosses are tagged explicitly.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
# Template names (matched lower-cased) that only place the page into
# categories; extract_category_templates() harvests their output.
CATEGORY_TEMPLATES = frozenset(
    {
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
        "top",
        "catlangcode",
        "topic",
    }
)
def extract_category_templates(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
    """Gather categories from category-only templates under *level_node*.

    The collected category names are appended to every entry whose
    language code matches the most recent entry in *page_data*.
    """
    cat_data: dict = {}
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.lower() in CATEGORY_TEMPLATES:
            # clean_node() records the template's categories in cat_data.
            clean_node(wxr, cat_data, t_node)
    if page_data:
        new_categories = cat_data.get("categories", [])
        last_lang = page_data[-1].lang_code
        for entry in page_data:
            if entry.lang_code == last_lang:
                entry.categories.extend(new_categories)