Coverage for src/wiktextract/extractor/th/page.py: 82%

71 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import string 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .alt_form import extract_alt_form_section, extract_romanization_section 

10from .descendant import extract_descendant_section 

11from .etymology import extract_etymology_section 

12from .linkage import extract_linkage_section 

13from .models import Sense, WordEntry 

14from .pos import ( 

15 extract_note_section, 

16 extract_pos_section, 

17 extract_usage_note_section, 

18) 

19from .section_titles import LINKAGE_SECTIONS, POS_DATA 

20from .sound import extract_sound_section 

21from .translation import extract_translation_section 

22 

23 

def _latest_entry(
    page_data: list[WordEntry], base_data: WordEntry
) -> WordEntry:
    """Return the last entry of ``page_data``, or ``base_data`` if it is empty."""
    return page_data[-1] if page_data else base_data


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section heading to its extractor, then recurse.

    The heading text is cleaned, stripped of trailing digits and whitespace,
    and matched against the known section titles.  Most extractors receive
    the most recent entry in ``page_data`` (falling back to ``base_data``
    when the list is empty); the etymology and sound extractors always
    receive ``base_data``.  Titles that are neither handled nor in the
    ignore list are reported through ``wxr.wtp.debug``.  Finally every
    child level node is processed recursively.
    """
    heading = clean_node(wxr, None, level_node.largs)
    title_text = heading.rstrip(string.digits + string.whitespace)
    wxr.wtp.start_subsection(title_text)

    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        # Some titles are both a POS title and a linkage/romanization title.
        # When the POS extractor produced no senses, drop that entry and
        # treat the section as linkage or romanization data instead.
        if not page_data[-1].senses:
            if title_text in LINKAGE_SECTIONS:
                page_data.pop()
                extract_linkage_section(
                    wxr,
                    _latest_entry(page_data, base_data),
                    level_node,
                    LINKAGE_SECTIONS[title_text],
                )
            elif title_text == "การถอดเป็นอักษรโรมัน":
                page_data.pop()
                extract_romanization_section(
                    wxr, _latest_entry(page_data, base_data), level_node
                )
    elif title_text == "รากศัพท์":
        # Work on a deep copy when this section has nested level nodes; the
        # copy is what gets passed down in the recursion below.
        # Coverage report: this condition was always true in the measured run.
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif title_text in ("คำแปลภาษาอื่น", "คำแปล"):
        extract_translation_section(
            wxr, _latest_entry(page_data, base_data), level_node
        )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            _latest_entry(page_data, base_data),
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "คำสืบทอด":
        extract_descendant_section(
            wxr, _latest_entry(page_data, base_data), level_node
        )
    elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง", "ออกเสียง")):
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "รูปแบบอื่น":
        # Coverage report: this condition was always true in the measured run.
        # Attach to the latest entry only when it has the same language code
        # and POS as base_data; otherwise store on base_data.
        alt_form_target = base_data
        if page_data:
            previous = page_data[-1]
            if (
                previous.lang_code == base_data.lang_code
                and previous.pos == base_data.pos
            ):
                alt_form_target = previous
        extract_alt_form_section(wxr, alt_form_target, level_node)
    elif title_text == "การใช้":
        extract_note_section(
            wxr, _latest_entry(page_data, base_data), level_node
        )
    elif title_text == "หมายเหตุการใช้":
        extract_usage_note_section(
            wxr, _latest_entry(page_data, base_data), level_node
        )
    elif title_text not in (
        "ดูเพิ่ม",  # see more
        "อ้างอิง",  # references
        "อ่านเพิ่ม",  # read more
        "อ่านเพิ่มเติม",  # read more
        "รากอักขระ",  # glyph origin
        "การผันรูป",  # conjugation
        "การผัน",  # conjugation
        "คำกริยาในรูปต่าง ๆ",  # verb forms
        "การอ่าน",  # Japanese readings
        "การผันคำกริยา",  # conjugation
        "การผันคำ",  # inflection
        "การกลายรูป",  # conjugation
        "การผันคำนาม",  # inflection
    ):
        wxr.wtp.debug(f"Unknown title: {title_text}")

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

110 

111 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Thai Wiktionary page into a list of word-entry dicts.

    Each level-2 node is treated as a language section: its heading (minus
    the "ภาษา" prefix) gives the language name, which is mapped to a
    language code.  Every child level node is handed to ``parse_section``.
    Entries that end up without senses receive a single ``no-gloss`` sense.
    Pages whose title ends with "/คำแปลภาษาอื่น" are skipped and yield an
    empty list.
    """
    # page layout
    # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน

    # skip translation pages
    # Coverage report: this condition was never true in the measured run.
    if page_title.endswith("/คำแปลภาษาอื่น"):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_node.largs).removeprefix("ภาษา")
        # Look up the code from the raw name, before any "unknown" fallback.
        lang_code = name_to_code(lang_name, "th")
        # Coverage report: neither fallback below was hit in the measured run.
        lang_code = "unknown" if lang_code == "" else lang_code
        lang_name = "unknown" if lang_name == "" else lang_name
        wxr.wtp.start_section(lang_name)

        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]