Coverage for src/wiktextract/extractor/ko/page.py: 62%

104 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 LevelNode, 

8 NodeKind, 

9 TemplateNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .etymology import extract_etymology_section 

15from .linkage import extract_linkage_section 

16from .models import Form, Sense, WordEntry 

17from .pos import extract_grammar_note_section, extract_pos_section 

18from .section_titles import LINKAGE_SECTIONS, POS_DATA 

19from .sound import ( 

20 SOUND_TEMPLATES, 

21 extract_sound_section, 

22 extract_sound_template, 

23) 

24from .tags import translate_raw_tags 

25from .translation import extract_translation_section 

26 

27 

def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Save section-level category links and category templates.

    Categories are attached to the most recent word entry, or to
    ``base_data`` when no entry has been created yet.
    """
    # page_data is not mutated below, so the target can be chosen once.
    target = page_data[-1] if page_data else base_data
    for link in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, target, link)
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name in ("C", "topics"):
            clean_node(wxr, target, template)

43 

44 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one wiki section to the matching extractor by its title."""
    title = clean_node(wxr, None, level_node.largs)
    # Strip a trailing section counter plus stray parentheses and spaces.
    title = re.sub(r"\s*\d+$", "", title).strip("() ")
    if "(" in title:
        title = title.partition("(")[0]

    # The last created entry, or the language-level data before any entry
    # exists; valid for every branch that does not mutate page_data first.
    entry = page_data[-1] if page_data else base_data
    if title.removeprefix("보조 ").strip() in POS_DATA:
        count_before = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title)
        # Some titles double as both POS and linkage headings; if the POS
        # pass produced nothing, retry the section as a linkage section.
        added_nothing = len(page_data) == count_before
        if added_nothing and title in LINKAGE_SECTIONS and page_data:
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title]
            )
    elif title in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, entry, level_node, LINKAGE_SECTIONS[title]
        )
    elif title == "번역":
        extract_translation_section(wxr, entry, level_node)
    elif title == "발음":
        extract_sound_section(wxr, entry, level_node)
    elif title == "어원":
        # Reuse the last entry only while it has no etymology text yet;
        # otherwise keep the text on base_data for later entries.
        etymology_target = (
            page_data[-1]
            if page_data and len(page_data[-1].etymology_texts) == 0
            else base_data
        )
        extract_etymology_section(wxr, etymology_target, level_node)
    elif title == "어법 주의 사항":
        extract_grammar_note_section(wxr, entry, level_node)
    elif title in ("다른 표기", "표기"):
        extract_alt_form_section(wxr, base_data, level_node)
    elif title in (
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
    ):
        pass  # deliberately ignored section titles
    else:
        wxr.wtp.debug(f"unknown title: {title}", sortid="ko/page/63")

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    extract_section_categories(wxr, page_data, base_data, level_node)

116 

117 

def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process a level-2 (per-language) section of a page."""
    entries_before = len(page_data)
    # Empty strings are the only falsy values clean_node/name_to_code
    # return here, so `or` matches the explicit == "" checks.
    lang_name = clean_node(wxr, None, level2_node.largs) or "unknown"
    lang_code = name_to_code(lang_name, "ko") or "unknown"
    wanted_codes = wxr.config.capture_language_codes
    if wanted_codes is not None and lang_code not in wanted_codes:
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    for template in level2_node.find_child(NodeKind.TEMPLATE):
        if template.template_name in SOUND_TEMPLATES:
            extract_sound_template(wxr, base_data, template)

    for child_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    if len(page_data) == entries_before:
        # No POS subsection produced an entry; treat the whole language
        # section as a single POS section with an empty title.
        extract_pos_section(wxr, page_data, base_data, level2_node, "")

151 

152 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Korean Wiktionary page into word-entry dictionaries.

    Page layout references:
    https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    """
    if page_title.startswith(("Appendix:", "T195546/NS111")):
        return []
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_section in root.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, lang_section)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

171 

172 

def extract_alt_form_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Extract alternative spellings given via "alt"/"alter" templates."""
    for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if template.template_name in ("alt", "alter"):
            extract_alt_template(wxr, base_data, template)

179 

180 

def extract_alt_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand an "alt"/"alter" template and collect the forms it lists."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    new_forms: list[Form] = []
    # First positional template argument is the language code of the forms.
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_classes = span_tag.attrs.get("class", "").split()
        if span_lang == lang_code:
            form_text = clean_node(wxr, None, span_tag)
            if form_text:
                new_forms.append(Form(form=form_text))
        elif span_lang.endswith("-Latn") and new_forms:
            # Romanization span follows the form it transliterates.
            new_forms[-1].roman = clean_node(wxr, None, span_tag)
        elif "label-content" in span_classes and new_forms:
            label = clean_node(wxr, None, span_tag)
            if label:
                # A label span qualifies every form collected so far.
                for form in new_forms:
                    form.raw_tags.append(label)
                    translate_raw_tags(form)
    base_data.forms.extend(new_forms)