Coverage for src/wiktextract/extractor/ms/page.py: 65%

90 statements  

coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

  1  import string
  2  from typing import Any
  3
  4  from mediawiki_langcodes import name_to_code
  5  from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
  6
  7  from ...page import clean_node
  8  from ...wxr_context import WiktextractContext
  9  from .linkage import extract_form_section, extract_linkage_section
 10  from .models import Sense, WordEntry
 11  from .pos import extract_pos_section
 12  from .section_titles import FORM_SECTIONS, LINKAGE_SECTIONS, POS_DATA
 13  from .sound import extract_sound_section
 14  from .translation import extract_translation_section
 15
 16
 17  def parse_section(
 18      wxr: WiktextractContext,
 19      page_data: list[WordEntry],
 20      base_data: WordEntry,
 21      level_node: LevelNode,
 22  ) -> None:
 23      title_text = clean_node(wxr, None, level_node.largs)
 24      wxr.wtp.start_subsection(title_text)
 25      title_text = title_text.rstrip(string.digits + string.whitespace + "IVX")
 26      lower_title = title_text.lower()
 27      if lower_title in POS_DATA:
 28          old_data_len = len(page_data)
 29          extract_pos_section(wxr, page_data, base_data, level_node, title_text)
 30          if len(page_data) == old_data_len and lower_title in LINKAGE_SECTIONS:
 31              extract_linkage_section(wxr, page_data, base_data, level_node)
 32      elif lower_title == "etimologi":
 33          extract_etymology_section(wxr, page_data, base_data, level_node)
 34      elif lower_title in FORM_SECTIONS:
 35          extract_form_section(
 36              wxr,
 37              page_data[-1] if len(page_data) > 0 else base_data,
 38              level_node,
 39              FORM_SECTIONS[lower_title],
 40          )
 41      elif lower_title == "tesaurus" or lower_title in LINKAGE_SECTIONS:
 42          extract_linkage_section(wxr, page_data, base_data, level_node)
 43      elif lower_title == "terjemahan":
 44          extract_translation_section(wxr, page_data, base_data, level_node)
 45      elif lower_title == "sebutan":  # 45 ↛ 47: line 45 didn't jump to line 47 because the condition on line 45 was always true
 46          extract_sound_section(wxr, page_data, base_data, level_node)
 47      elif lower_title in ["nota penggunaan", "penggunaan"]:
 48          extract_note_section(
 49              wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
 50          )
 51      elif lower_title not in [
 52          "pautan luar",
 53          "rujukan",
 54          "bacaan lanjut",
 55          "lihat juga",
 56      ]:
 57          wxr.wtp.debug(f"Unknown section: {title_text}", sortid="ms/page/44")
 58
 59      for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
 60          parse_section(wxr, page_data, base_data, next_level)
 61      for link_node in level_node.find_child(NodeKind.LINK):  # 61 ↛ 62: line 61 didn't jump to line 62 because the loop on line 61 never started
 62          clean_node(
 63              wxr, page_data[-1] if len(page_data) > 0 else base_data, link_node
 64          )
 65      for t_node in level_node.find_child(NodeKind.TEMPLATE):
 66          if t_node.template_name in ["topik", "C", "topics"]:  # 66 ↛ 67: line 66 didn't jump to line 67 because the condition on line 66 was never true
 67              clean_node(
 68                  wxr, page_data[-1] if len(page_data) > 0 else base_data, t_node
 69              )
 70
 71
 72  def parse_page(
 73      wxr: WiktextractContext, page_title: str, page_text: str
 74  ) -> list[dict[str, Any]]:
 75      # Page format
 76      # https://ms.wiktionary.org/wiki/Wikikamus:Memulakan_laman_baru#Format_laman
 77      if page_title.startswith(("Portal:", "Reconstruction:")):  # 77 ↛ 78: line 77 didn't jump to line 78 because the condition on line 77 was never true
 78          return []
 79      wxr.wtp.start_page(page_title)
 80      tree = wxr.wtp.parse(page_text, pre_expand=True)
 81      page_data: list[WordEntry] = []
 82
 83      for level2_node in tree.find_child(NodeKind.LEVEL2):
 84          pre_data_len = len(page_data)
 85          lang_name = clean_node(wxr, None, level2_node.largs)
 86          lang_code = (
 87              name_to_code(lang_name.removeprefix("Bahasa "), "ms") or "unknown"
 88          )
 89          wxr.wtp.start_section(lang_name)
 90          base_data = WordEntry(
 91              word=wxr.wtp.title,
 92              lang_code=lang_code,
 93              lang=lang_name,
 94              pos="unknown",
 95          )
 96          for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
 97              parse_section(wxr, page_data, base_data, next_level_node)
 98          if len(page_data) == pre_data_len:
 99              page_data.append(base_data.model_copy(deep=True))
100
101      for data in page_data:
102          if len(data.senses) == 0:
103              data.senses.append(Sense(tags=["no-gloss"]))
104      return [m.model_dump(exclude_defaults=True) for m in page_data]
105
106
107  def extract_etymology_section(
108      wxr: WiktextractContext,
109      page_data: list[WordEntry],
110      base_data: WordEntry,
111      level_node: LevelNode,
112  ) -> None:
113      cats = {}
114      e_text = clean_node(
115          wxr, cats, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
116      )
117      if e_text == "":  # 117 ↛ 118: line 117 didn't jump to line 118 because the condition on line 117 was never true
118          return
119      if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:  # 119 ↛ 122: line 119 didn't jump to line 122 because the condition on line 119 was always true
120          base_data.etymology_text = e_text
121          base_data.categories.extend(cats.get("categories", []))
122      elif level_node.kind == NodeKind.LEVEL3:
123          for data in page_data:
124              if data.lang_code == page_data[-1].lang_code:
125                  data.etymology_text = e_text
126                  data.categories.extend(cats.get("categories", []))
127      else:
128          page_data[-1].etymology_text = e_text
129          page_data[-1].categories.extend(cats.get("categories", []))
130
131
132  def extract_note_section(
133      wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
134  ) -> None:
135      has_list = False
136      for list_node in level_node.find_child(NodeKind.LIST):
137          has_list = True
138          for list_item in list_node.find_child(NodeKind.LIST_ITEM):
139              note = clean_node(wxr, None, list_item.children)
140              if note != "":
141                  word_entry.notes.append(note)
142      if not has_list:
143          note = clean_node(wxr, None, level_node.children)
144          if note != "":
145              word_entry.notes.append(note)
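
The partial-branch annotations above mark conditions the test suite never flips: the check on line 45 ("sebutan") was always true, while the branches on lines 61, 66, 77, and 117 were never taken. A minimal test sketch that would exercise the untaken branch on line 77 (skipping "Portal:"/"Reconstruction:" titles) is shown below; the context setup follows the pattern used by other wiktextract extractor tests, and the exact constructor arguments are assumptions rather than part of this report.

    from wikitextprocessor import Wtp

    from wiktextract.config import WiktionaryConfig
    from wiktextract.extractor.ms.page import parse_page
    from wiktextract.wxr_context import WiktextractContext


    def test_portal_page_is_skipped() -> None:
        # Assumed fixture: a Malay-language Wtp/WiktionaryConfig pair, mirroring
        # how other extractor tests in this repository build their context.
        wxr = WiktextractContext(
            Wtp(lang_code="ms"), WiktionaryConfig(dump_file_lang_code="ms")
        )
        # A title in the Portal: namespace should hit the early return on
        # line 77 and yield no entries.
        assert parse_page(wxr, "Portal:Bahasa Melayu", "teks contoh") == []

Adding a page whose level-2 heading lacks a pronunciation ("Sebutan") section, or one carrying a "topik"/"C"/"topics" template, would similarly cover the remaining partial branches on lines 45 and 66.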