Coverage for src/wiktextract/extractor/th/page.py: 82%
71 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import string
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .alt_form import extract_alt_form_section, extract_romanization_section
10from .descendant import extract_descendant_section
11from .etymology import extract_etymology_section
12from .linkage import extract_linkage_section
13from .models import Sense, WordEntry
14from .pos import (
15 extract_note_section,
16 extract_pos_section,
17 extract_usage_note_section,
18)
19from .section_titles import LINKAGE_SECTIONS, POS_DATA
20from .sound import extract_sound_section
21from .translation import extract_translation_section
def _latest_entry(page_data: list[WordEntry], base_data: WordEntry) -> WordEntry:
    """Return the last entry in ``page_data``, or ``base_data`` if it is empty."""
    return page_data[-1] if page_data else base_data


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one heading node to the matching section extractor.

    The heading text is cleaned, stripped of trailing digits and whitespace,
    and then matched against the known Thai section titles in a fixed order.
    After the section itself is handled, every child heading node is
    processed recursively with the same ``page_data``.

    Args:
        wxr: Extraction context; also receives subsection and debug calls.
        page_data: Entries collected so far for the page. The last element
            may be popped in the part-of-speech branch.
        base_data: Entry used as the target when ``page_data`` is empty, and
            always for the etymology and pronunciation sections.
        level_node: The heading node to process.
    """
    heading = clean_node(wxr, None, level_node.largs).rstrip(
        string.digits + string.whitespace
    )
    wxr.wtp.start_subsection(heading)

    if heading in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, heading)
        if not page_data[-1].senses:
            # The title is also a POS title but the newest entry has no
            # senses: drop that entry and re-read the section as a linkage
            # or romanization section instead.
            if heading in LINKAGE_SECTIONS:
                page_data.pop()
                extract_linkage_section(
                    wxr,
                    _latest_entry(page_data, base_data),
                    level_node,
                    LINKAGE_SECTIONS[heading],
                )
            elif heading == "การถอดเป็นอักษรโรมัน":
                page_data.pop()
                extract_romanization_section(
                    wxr, _latest_entry(page_data, base_data), level_node
                )
    elif heading == "รากศัพท์":
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            # Rebinding the local means the etymology extractor and the
            # recursive calls below receive the copy, not the caller's object.
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif heading in ["คำแปลภาษาอื่น", "คำแปล"]:
        extract_translation_section(
            wxr, _latest_entry(page_data, base_data), level_node
        )
    elif heading in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            _latest_entry(page_data, base_data),
            level_node,
            LINKAGE_SECTIONS[heading],
        )
    elif heading == "คำสืบทอด":
        extract_descendant_section(
            wxr, _latest_entry(page_data, base_data), level_node
        )
    elif heading.startswith(("การออกเสียง", "การอ่านออกเสียง", "ออกเสียง")):
        extract_sound_section(wxr, base_data, level_node)
    elif heading == "รูปแบบอื่น":
        # Use the newest entry only when its language code and POS both
        # match base_data; otherwise fall back to base_data.
        newest = page_data[-1] if page_data else None
        use_newest = (
            newest is not None
            and newest.lang_code == base_data.lang_code
            and newest.pos == base_data.pos
        )
        extract_alt_form_section(
            wxr, newest if use_newest else base_data, level_node
        )
    elif heading == "การใช้":
        extract_note_section(
            wxr, _latest_entry(page_data, base_data), level_node
        )
    elif heading == "หมายเหตุการใช้":
        extract_usage_note_section(
            wxr, _latest_entry(page_data, base_data), level_node
        )
    elif heading not in (
        "ดูเพิ่ม",  # see more
        "อ้างอิง",  # references
        "อ่านเพิ่ม",  # read more
        "อ่านเพิ่มเติม",  # read more
        "รากอักขระ",  # glyph origin
        "การผันรูป",  # conjugation
        "การผัน",  # conjugation
        "คำกริยาในรูปต่าง ๆ",  # verb forms
        "การอ่าน",  # Japanese readings
        "การผันคำกริยา",  # conjugation
        "การผันคำ",  # inflection
        "การกลายรูป",  # conjugation
        "การผันคำนาม",  # inflection
    ):
        wxr.wtp.debug(f"Unknown title: {heading}")

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Thai Wiktionary page into a list of entry dictionaries.

    Pages whose title ends with "/คำแปลภาษาอื่น" are skipped and yield an
    empty list. Otherwise each level-2 heading is read as a language name,
    a base ``WordEntry`` is built for it, and its child headings are handed
    to ``parse_section``. Entries that end up with no senses receive a
    single ``Sense`` tagged "no-gloss".

    Args:
        wxr: Extraction context used for parsing and section tracking.
        page_title: Title of the page being parsed.
        page_text: Wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dictionary per entry.
    """
    # page layout
    # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน

    # skip translation pages
    if page_title.endswith("/คำแปลภาษาอื่น"):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        raw_name = clean_node(wxr, None, lang_node.largs).removeprefix("ภาษา")
        # The code lookup uses the name before any "unknown" substitution.
        found_code = name_to_code(raw_name, "th")
        lang_code = "unknown" if found_code == "" else found_code
        lang_name = "unknown" if raw_name == "" else raw_name
        wxr.wtp.start_section(lang_name)

        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    # Give every sense-less entry a placeholder sense before serializing.
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]