Coverage for src/wiktextract/extractor/ms/page.py: 74%

104 statements  

coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

  1  import string
  2  from typing import Any
  3
  4  from mediawiki_langcodes import name_to_code
  5  from wikitextprocessor.parser import (
  6      LEVEL_KIND_FLAGS,
  7      LevelNode,
  8      NodeKind,
  9      WikiNode,
 10  )
 11
 12  from ...page import clean_node
 13  from ...wxr_context import WiktextractContext
 14  from .linkage import extract_form_section, extract_linkage_section
 15  from .models import Sense, WordEntry
 16  from .pos import extract_pos_section
 17  from .section_titles import FORM_SECTIONS, LINKAGE_SECTIONS, POS_DATA
 18  from .sound import extract_sound_section
 19  from .translation import extract_translation_section
 20
 21

22def parse_section( 

23 wxr: WiktextractContext, 

24 page_data: list[WordEntry], 

25 base_data: WordEntry, 

26 level_node: LevelNode, 

27) -> None: 

28 title_text = clean_node(wxr, None, level_node.largs) 

29 wxr.wtp.start_subsection(title_text) 

30 title_text = title_text.rstrip(string.digits + string.whitespace + "IVX") 

31 lower_title = title_text.lower() 

32 if lower_title in POS_DATA: 

33 old_data_len = len(page_data) 

34 extract_pos_section(wxr, page_data, base_data, level_node, title_text) 

35 if len(page_data) == old_data_len and lower_title in LINKAGE_SECTIONS: 

36 extract_linkage_section(wxr, page_data, base_data, level_node) 

37 elif lower_title == "etimologi": 

38 extract_etymology_section(wxr, page_data, base_data, level_node) 

39 elif lower_title in FORM_SECTIONS: 

40 extract_form_section( 

41 wxr, 

42 page_data[-1] if len(page_data) > 0 else base_data, 

43 level_node, 

44 FORM_SECTIONS[lower_title], 

45 ) 

46 elif lower_title == "tesaurus" or lower_title in LINKAGE_SECTIONS: 

47 extract_linkage_section(wxr, page_data, base_data, level_node) 

48 elif lower_title == "terjemahan": 

49 extract_translation_section(wxr, page_data, base_data, level_node) 

50 elif lower_title == "sebutan": 50 ↛ 52line 50 didn't jump to line 52 because the condition on line 50 was always true

51 extract_sound_section(wxr, page_data, base_data, level_node) 

52 elif lower_title in ["nota penggunaan", "penggunaan"]: 

53 extract_note_section( 

54 wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node 

55 ) 

56 elif lower_title not in [ 

57 "pautan luar", 

58 "rujukan", 

59 "bacaan lanjut", 

60 "lihat juga", 

61 ]: 

62 wxr.wtp.debug(f"Unknown section: {title_text}", sortid="ms/page/44") 

63 

64 for next_level in level_node.find_child(LEVEL_KIND_FLAGS): 

65 parse_section(wxr, page_data, base_data, next_level) 

66 for link_node in level_node.find_child(NodeKind.LINK): 66 ↛ 67line 66 didn't jump to line 67 because the loop on line 66 never started

67 clean_node( 

68 wxr, page_data[-1] if len(page_data) > 0 else base_data, link_node 

69 ) 

70 for t_node in level_node.find_child(NodeKind.TEMPLATE): 

71 if t_node.template_name in ["topik", "C", "topics"]: 71 ↛ 72line 71 didn't jump to line 72 because the condition on line 71 was never true

72 clean_node( 

73 wxr, page_data[-1] if len(page_data) > 0 else base_data, t_node 

74 ) 

75 

76 

 77  def parse_page(
 78      wxr: WiktextractContext, page_title: str, page_text: str
 79  ) -> list[dict[str, Any]]:
 80      # Page format
 81      # https://ms.wiktionary.org/wiki/Wikikamus:Memulakan_laman_baru#Format_laman
 82      if page_title.startswith(("Portal:", "Reconstruction:")):
     [82 ↛ 83]  line 82 didn't jump to line 83 because the condition on line 82 was never true
 83          return []
 84      wxr.wtp.start_page(page_title)
 85      tree = wxr.wtp.parse(page_text, pre_expand=True)
 86      page_data: list[WordEntry] = []
 87
 88      for level2_node in tree.find_child(NodeKind.LEVEL2):
 89          pre_data_len = len(page_data)
 90          lang_name = clean_node(wxr, None, level2_node.largs)
 91          lang_code = (
 92              name_to_code(lang_name.removeprefix("Bahasa "), "ms") or "unknown"
 93          )
 94          wxr.wtp.start_section(lang_name)
 95          base_data = WordEntry(
 96              word=wxr.wtp.title,
 97              lang_code=lang_code,
 98              lang=lang_name,
 99              pos="unknown",
100          )
101          for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
102              parse_section(wxr, page_data, base_data, next_level_node)
103          if len(page_data) == pre_data_len:
104              page_data.append(base_data.model_copy(deep=True))
105
106      for data in page_data:
107          if len(data.senses) == 0:
108              data.senses.append(Sense(tags=["no-gloss"]))
109      return [m.model_dump(exclude_defaults=True) for m in page_data]
110
111

112  def extract_etymology_section(
113      wxr: WiktextractContext,
114      page_data: list[WordEntry],
115      base_data: WordEntry,
116      level_node: LevelNode,
117  ):
118      cats = {}
119      e_nodes = []
120      e_texts = []
121      for node in level_node.children:
122          if isinstance(node, LevelNode):
     [122 ↛ 123]  line 122 didn't jump to line 123 because the condition on line 122 was never true
123              break
124          elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
125              for list_item in node.find_child(NodeKind.LIST_ITEM):
126                  e_text = clean_node(wxr, cats, list_item.children)
127                  if e_text != "":
     [127 ↛ 125]  line 127 didn't jump to line 125 because the condition on line 127 was always true
128                      e_texts.append(e_text)
129          else:
130              e_nodes.append(node)
131      if len(e_nodes) > 0:
     [131 ↛ 135]  line 131 didn't jump to line 135 because the condition on line 131 was always true
132          e_text = clean_node(wxr, cats, e_nodes)
133          if e_text != "":
134              e_texts.append(e_text)
135      if len(e_texts) == 0:
     [135 ↛ 136]  line 135 didn't jump to line 136 because the condition on line 135 was never true
136          return
137      if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
138          base_data.etymology_texts = e_texts
139          base_data.categories.extend(cats.get("categories", []))
140      elif level_node.kind == NodeKind.LEVEL3:
     [140 ↛ 146]  line 140 didn't jump to line 146 because the condition on line 140 was always true
141          for data in page_data:
142              if data.lang_code == page_data[-1].lang_code:
     [142 ↛ 141]  line 142 didn't jump to line 141 because the condition on line 142 was always true
143                  data.etymology_texts = e_texts
144                  data.categories.extend(cats.get("categories", []))
145      else:
146          page_data[-1].etymology_texts = e_texts
147          page_data[-1].categories.extend(cats.get("categories", []))
148
149

150  def extract_note_section(
151      wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
152  ) -> None:
153      has_list = False
154      for list_node in level_node.find_child(NodeKind.LIST):
155          has_list = True
156          for list_item in list_node.find_child(NodeKind.LIST_ITEM):
157              note = clean_node(wxr, None, list_item.children)
158              if note != "":
159                  word_entry.notes.append(note)
160      if not has_list:
161          note = clean_node(wxr, None, level_node.children)
162          if note != "":
163              word_entry.notes.append(note)
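
For orientation, the module is driven per page: parse_page() splits the page into level-2 language sections, builds a base WordEntry for each, and lets parse_section() dispatch every subsection by its lower-cased title (POS titles, "etimologi", "terjemahan", "sebutan", and so on). Below is a minimal sketch of calling parse_page() directly, roughly in the style of an extractor unit test; the Wtp and WiktionaryConfig constructor arguments and the sample wikitext (including the assumption that "Kata nama" is one of the POS titles in POS_DATA) are assumptions, not facts taken from the file above.

# Hedged sketch: exercise parse_page() on a hand-written ms.wiktionary page.
# Constructor arguments are assumptions and may differ between versions.
from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.ms.page import parse_page
from wiktextract.wxr_context import WiktextractContext

wxr = WiktextractContext(
    Wtp(lang_code="ms"),  # assumed: temporary in-memory page database for Malay wiki
    WiktionaryConfig(),   # assumed: default config is enough for a direct call
)

page_text = """== Bahasa Melayu ==
=== Kata nama ===

# cecair jernih yang diperlukan oleh hidupan
"""

# Each returned dict is a WordEntry dumped with exclude_defaults=True;
# entries with no extracted senses carry the "no-gloss" tag.
for entry in parse_page(wxr, "air", page_text):
    print(entry)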