Coverage for src/wiktextract/extractor/id/page.py: 85%

55 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import string 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .etymology import extract_etymology_section 

10from .linkage import extract_linkage_section 

11from .models import Sense, WordEntry 

12from .pos import extract_pos_section, extract_usage_section 

13from .section_titles import LINKAGE_SECTIONS, POS_DATA 

14from .sound import extract_sound_section 

15from .translation import extract_translation_section 

16 

17 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one heading node to its extractor, then process its children.

    The heading text is cleaned, trailing digits and whitespace are
    stripped, and the result is registered with
    ``wxr.wtp.start_subsection``.  The title then selects an extractor;
    titles that match nothing and are not in the ignored list are
    reported through ``wxr.wtp.debug``.

    After dispatching, every nested heading is parsed recursively, and
    the node's direct link children and ``*-cat`` template children are
    passed through ``clean_node`` against the current entry.

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far for the page.
        base_data: Fallback entry used while ``page_data`` is empty.
        level_node: Heading node to process.
    """

    def current_entry() -> WordEntry:
        # Re-evaluated at every use: page_data is handed to
        # extract_pos_section and to the recursive calls, so its last
        # element may differ between calls (presumably entries get
        # appended there -- confirm against those functions).
        return page_data[-1] if page_data else base_data

    sound_titles = (
        "Pelafalan",
        "Ejaan",
        "Pengucapan",
        "Suara",
        "Pemenggalan kata",
    )
    usage_titles = ("Penggunaan", "Catatan penggunaan", "Catatan")
    # Titles that are skipped without a debug message.
    ignored_titles = (
        "Bacaan lebih lanjut",
        "Referensi",
        "Pranala luar",
        "Rujukan",
        "Acuan",
        "Bacaan lanjutan",
    )

    raw_title = clean_node(wxr, None, level_node.largs)
    title_text = raw_title.rstrip(string.whitespace + string.digits)
    wxr.wtp.start_subsection(title_text)

    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
    elif title_text == "Etimologi":
        extract_etymology_section(wxr, current_entry(), level_node)
    elif title_text == "Terjemahan":
        extract_translation_section(wxr, current_entry(), level_node)
    elif title_text in sound_titles:
        extract_sound_section(wxr, current_entry(), level_node)
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            current_entry(),
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text in usage_titles:
        extract_usage_section(wxr, current_entry(), level_node)
    elif title_text not in ignored_titles:
        wxr.wtp.debug(f"Unknown section: {title_text}", sortid="id/page/47")

    # Descend into nested headings before handling this node's own
    # links and templates.
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    # Passing the entry to clean_node lets it record data on that entry
    # (presumably categories -- confirm against clean_node).
    for link in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, current_entry(), link)

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if not template.template_name.endswith("-cat"):
            continue
        clean_node(wxr, current_entry(), template)

81 

82 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page's wikitext into a list of dumped word entries.

    Pages whose title starts with one of the skipped prefixes yield an
    empty list.  Otherwise each level-2 heading is treated as a language
    section: a base ``WordEntry`` is built for it and every nested
    heading is handed to ``parse_section``.  Entries that end up with no
    senses receive a single ``no-gloss`` sense before being dumped.

    Args:
        wxr: Extraction context.
        page_title: Title of the page.
        page_text: Raw wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dict per collected entry.
    """
    # page layout
    # https://id.wiktionary.org/wiki/Wikikamus:Penjelasan_tataletak_entri
    # https://id.wiktionary.org/wiki/Wikikamus:Format_Kamus
    skipped_prefixes = ("Portal:", "Rekonstruksi:", "Thesaurus:", "WK:")
    if page_title.startswith(skipped_prefixes):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        category_data = {}
        lang_name = clean_node(wxr, category_data, lang_node.largs)
        if not lang_name:
            lang_name = "unknown"

        # The heading may carry a leading "bahasa "; drop it before the
        # name-to-code lookup.
        bare_name = lang_name.lower().removeprefix("bahasa ")
        lang_code = name_to_code(bare_name, "id")
        if not lang_code:
            lang_code = "unknown"

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=category_data.get("categories", []),
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    for entry in entries:
        if len(entry.senses) > 0:
            continue
        entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]