Coverage for src/wiktextract/extractor/th/page.py: 83%

81 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import string 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .alt_form import extract_alt_form_section, extract_romanization_section 

10from .descendant import extract_descendant_section 

11from .etymology import extract_etymology_section 

12from .linkage import extract_linkage_section 

13from .models import Sense, WordEntry 

14from .pos import ( 

15 extract_note_section, 

16 extract_pos_section, 

17 extract_usage_note_section, 

18) 

19from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS 

20from .sound import extract_sound_section 

21from .translation import extract_translation_section 

22 

23 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one heading node to its section extractor, then recurse.

    The heading text is cleaned, stripped of trailing digits and
    whitespace, and matched against the known section titles.  Nested
    heading nodes are processed recursively, and finally category
    templates found directly under ``level_node`` are collected.

    Args:
        wxr: Extraction context; also used for debug messages.
        page_data: Entries gathered so far for the page; may be mutated.
        base_data: Entry used when ``page_data`` holds no entry yet, and
            passed on to the POS, etymology and sound extractors.
        level_node: The heading node being processed.
    """
    heading = clean_node(wxr, None, level_node.largs).rstrip(
        string.digits + string.whitespace
    )
    wxr.wtp.start_subsection(heading)

    def latest_or_base() -> WordEntry:
        # Evaluated lazily because the POS branch mutates ``page_data``.
        return page_data[-1] if page_data else base_data

    if heading in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, heading)
        if not page_data[-1].senses:
            # The title is also a linkage/romanization title and the POS
            # extractor yielded no senses: drop that entry and re-read the
            # section with the matching extractor instead.
            if heading in LINKAGE_SECTIONS:
                page_data.pop()
                extract_linkage_section(
                    wxr, latest_or_base(), level_node, LINKAGE_SECTIONS[heading]
                )
            elif heading == "การถอดเป็นอักษรโรมัน":
                page_data.pop()
                extract_romanization_section(
                    wxr, latest_or_base(), level_node
                )
    elif heading == "รากศัพท์":
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            # Work on a private copy, which is also what the nested
            # sections below receive as their base entry.
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif heading in TRANSLATION_SECTIONS:
        extract_translation_section(wxr, latest_or_base(), level_node)
    elif heading in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, latest_or_base(), level_node, LINKAGE_SECTIONS[heading]
        )
    elif heading == "คำสืบทอด":
        extract_descendant_section(wxr, latest_or_base(), level_node)
    elif heading.startswith(("การออกเสียง", "การอ่านออกเสียง", "ออกเสียง")):
        extract_sound_section(wxr, base_data, level_node)
    elif heading == "รูปแบบอื่น":
        # Only reuse the latest entry when its language and POS both
        # match the base entry; otherwise attach to the base entry.
        reuse_latest = (
            bool(page_data)
            and page_data[-1].lang_code == base_data.lang_code
            and page_data[-1].pos == base_data.pos
        )
        extract_alt_form_section(
            wxr, page_data[-1] if reuse_latest else base_data, level_node
        )
    elif heading == "การใช้":
        extract_note_section(wxr, latest_or_base(), level_node)
    elif heading == "หมายเหตุการใช้":
        extract_usage_note_section(wxr, latest_or_base(), level_node)
    elif heading not in {
        "ดูเพิ่ม",  # see more
        "อ้างอิง",  # references
        "อ่านเพิ่ม",  # read more
        "อ่านเพิ่มเติม",  # read more
        "รากอักขระ",  # glyph origin
        "การผันรูป",  # conjugation
        "การผัน",  # conjugation
        "คำกริยาในรูปต่าง ๆ",  # verb forms
        "การอ่าน",  # Japanese readings
        "การผันคำกริยา",  # conjugation
        "การผันคำ",  # inflection
        "การกลายรูป",  # conjugation
        "การผันคำนาม",  # inflection
    }:
        # Titles listed above are ignored silently; anything else is
        # reported.
        wxr.wtp.debug(f"Unknown title: {heading}", sortid="th/page/106")

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    extract_category_templates(wxr, page_data or [base_data], level_node)

114 

115 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Page layout reference:
    https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน

    Args:
        wxr: Extraction context used for parsing and node cleaning.
        page_title: Title of the page being processed.
        page_text: Raw wikitext of the page.

    Returns:
        One dict per collected entry, dumped with default-valued fields
        excluded.  Translation subpages yield an empty list.
    """
    # skip translation pages
    if page_title.endswith("/คำแปลภาษาอื่น"):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        heading = clean_node(wxr, None, lang_node.largs)
        stripped_name = heading.removeprefix("ภาษา")
        found_code = name_to_code(stripped_name, "th")
        # Fall back to "unknown" when either value is an empty string.
        lang_code = "unknown" if found_code == "" else found_code
        lang_name = "unknown" if stripped_name == "" else stripped_name
        wxr.wtp.start_section(lang_name)
        section_base = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for child_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, section_base, child_node)

    # Entries that ended up without any sense get a placeholder sense.
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]

150 

151 

# Lower-cased template names treated as category templates by
# extract_category_templates().
CATEGORY_TEMPLATES = frozenset(
    {
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
        "top",
        "catlangcode",
        "topic",
    }
)

164 

165 

def extract_category_templates(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    """Collect categories from category templates under ``level_node``.

    Every direct child template whose lower-cased name is listed in
    ``CATEGORY_TEMPLATES`` is passed through ``clean_node`` so that its
    categories accumulate in a scratch dict.  The collected categories
    are then appended to each entry in ``page_data`` that has the same
    ``lang_code`` as the last entry.

    Args:
        wxr: Extraction context handed to ``clean_node``.
        page_data: Entries to update in place.
        level_node: Heading node whose child templates are inspected.
    """
    collected = {}
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.lower() not in CATEGORY_TEMPLATES:
            continue
        clean_node(wxr, collected, template)

    if not page_data:
        # Nothing to update; also avoids indexing an empty list below.
        return

    new_categories = collected.get("categories", [])
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.categories.extend(new_categories)