Coverage for src/wiktextract/extractor/ko/page.py: 85%

72 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .etymology import extract_etymology_section 

10from .linkage import extract_linkage_section 

11from .models import Sense, WordEntry 

12from .pos import extract_grammar_note_section, extract_pos_section 

13from .section_titles import LINKAGE_SECTIONS, POS_DATA 

14from .sound import ( 

15 SOUND_TEMPLATES, 

16 extract_sound_section, 

17 extract_sound_template, 

18) 

19from .translation import extract_translation_section 

20 

21 

def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Run ``clean_node`` over every direct link child of ``level_node``.

    The links are processed against the most recent entry in ``page_data``;
    when ``page_data`` is still empty, ``base_data`` is used instead.
    Presumably this is how category links placed in a section end up on the
    entry -- confirm against ``clean_node``.
    """
    # The list is not modified while iterating, so the target can be picked
    # once up front instead of once per link.
    if len(page_data) > 0:
        target = page_data[-1]
    else:
        target = base_data
    for link in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, target, link)

32 

33 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section to the matching extractor, then recurse.

    The heading text is normalised (trailing number removed, surrounding
    parentheses/spaces stripped, anything from the first "(" onwards cut off)
    and compared against the known section titles.  Afterwards every child
    level node is parsed recursively and the section's own link children are
    handed to ``extract_section_categories``.

    Args:
        wxr: Extraction context; also used for debug messages.
        page_data: Entries collected so far for the page; may be extended.
        base_data: Language-level entry used when ``page_data`` is empty.
        level_node: The section node to process.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    # Remove a trailing counter such as " 2", then outer parentheses/spaces.
    title_text = re.sub(r"\s*\d+$", "", title_text).strip("() ")
    if "(" in title_text:
        # Keep only the part before the parenthesis.  Strip again: the space
        # that usually precedes "(" would otherwise make every exact title
        # comparison below fail and leak into the POS title.
        title_text = title_text[: title_text.index("(")].strip()

    # Entry that non-POS sections attach to: the newest entry, or the
    # language-level data when no entry exists yet.
    target = page_data[-1] if len(page_data) > 0 else base_data

    if title_text.removeprefix("보조 ").strip() in POS_DATA:
        entries_before = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if (
            len(page_data) == entries_before
            and title_text in LINKAGE_SECTIONS
            and len(page_data) > 0
        ):  # try extract as linkage section
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title_text]
            )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, target, level_node, LINKAGE_SECTIONS[title_text]
        )
    elif title_text == "번역":
        extract_translation_section(wxr, target, level_node)
    elif title_text == "발음":
        extract_sound_section(wxr, target, level_node)
    elif title_text == "어원":
        # Use the newest entry only while it has no etymology text yet;
        # otherwise the etymology is stored on the language-level data.
        if len(page_data) > 0 and len(page_data[-1].etymology_texts) == 0:
            etymology_target = page_data[-1]
        else:
            etymology_target = base_data
        extract_etymology_section(wxr, etymology_target, level_node)
    elif title_text == "어법 주의 사항":
        extract_grammar_note_section(wxr, target, level_node)
    elif title_text not in (
        # Known titles that are deliberately ignored.
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
    ):
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="ko/page/63")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_section_categories(wxr, page_data, base_data, level_node)

103 

104 

def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Parse one level-2 section, whose heading is the language name.

    Resolves the language code, skips the section when the configuration
    restricts captured language codes and this one is not included, builds
    the language-level ``WordEntry`` and parses all child sections.  If no
    entry was added by the child sections, the level-2 node itself is passed
    to ``extract_pos_section`` with an empty title.
    """
    entries_before = len(page_data)

    heading = clean_node(wxr, None, level2_node.largs)
    lang_name = "unknown" if heading == "" else heading
    resolved_code = name_to_code(lang_name, "ko")
    lang_code = "unknown" if resolved_code == "" else resolved_code

    wanted_codes = wxr.config.capture_language_codes
    if wanted_codes is not None and lang_code not in wanted_codes:
        return

    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    # NOTE(review): when page_data already holds entries (for example from an
    # earlier language section of the same page), this call processes the
    # links against page_data[-1] rather than base_data -- confirm intended.
    extract_section_categories(wxr, page_data, base_data, level2_node)

    # Sound templates placed directly under the language heading.
    for template in level2_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in SOUND_TEMPLATES:
            continue
        extract_sound_template(wxr, base_data, template)

    for child_section in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)

    # no POS section
    if len(page_data) == entries_before:
        extract_pos_section(wxr, page_data, base_data, level2_node, "")

138 

139 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its entries as plain dictionaries.

    Every level-2 node of the parsed page is handled by
    ``parse_language_section``.  Entries that end up without any sense get a
    single placeholder sense tagged ``no-gloss``.  Each entry is dumped with
    ``model_dump(exclude_defaults=True)``.
    """
    # page layout
    # https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    # https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)

    entries: list[WordEntry] = []
    for language_node in root.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, entries, language_node)

    dumped = []
    for entry in entries:
        if len(entry.senses) == 0:
            entry.senses.append(Sense(tags=["no-gloss"]))
        dumped.append(entry.model_dump(exclude_defaults=True))
    return dumped