Coverage for src/wiktextract/extractor/ko/page.py: 62%

104 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

import re
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    LevelNode,
    NodeKind,
    TemplateNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .linkage import extract_linkage_section
from .models import Form, Sense, WordEntry
from .pos import extract_grammar_note_section, extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import (
    SOUND_TEMPLATES,
    extract_sound_section,
    extract_sound_template,
)
from .tags import translate_raw_tags
from .translation import extract_translation_section

26 

27 

def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect category links and {{C}}/{{topics}} templates of a section.

    Categories are attached to the most recent word entry when one
    exists, otherwise to the shared language-level ``base_data``.
    """
    for link_node in level_node.find_child(NodeKind.LINK):
        target = base_data if not page_data else page_data[-1]
        clean_node(wxr, target, link_node)
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name not in ("C", "topics"):
            continue
        target = base_data if not page_data else page_data[-1]
        clean_node(wxr, target, t_node)

43 

44 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route a subsection to the extractor matching its Korean title.

    After the section itself is handled, child sections are parsed
    recursively and its category links/templates are collected.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    # Strip a trailing index number (e.g. "명사 1") and wrapping "()"/spaces.
    title_text = re.sub(r"\s*\d+$", "", title_text).strip("() ")
    if "(" in title_text:
        title_text = title_text[: title_text.index("(")]

    # Data object most extractors write into: the newest entry when one
    # exists, otherwise the language-level base data.
    target = page_data[-1] if page_data else base_data

    if title_text.removeprefix("보조 ").strip() in POS_DATA:
        entries_before = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if (
            len(page_data) == entries_before
            and title_text in LINKAGE_SECTIONS
            and len(page_data) > 0
        ):
            # The POS extractor created no entry and the title doubles as
            # a linkage heading, so retry the section as linkage data.
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title_text]
            )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, target, level_node, LINKAGE_SECTIONS[title_text]
        )
    elif title_text == "번역":
        # Translation section
        extract_translation_section(wxr, target, level_node)
    elif title_text == "발음":
        # Pronunciation section
        extract_sound_section(wxr, target, level_node)
    elif title_text == "어원":
        # Etymology section: reuse the last entry only while it has no
        # etymology text yet; otherwise store on base_data.
        extract_etymology_section(
            wxr,
            page_data[-1]
            if len(page_data) > 0 and len(page_data[-1].etymology_texts) == 0
            else base_data,
            level_node,
        )
    elif title_text == "어법 주의 사항":
        # Usage/grammar note section
        extract_grammar_note_section(wxr, target, level_node)
    elif title_text in ("다른 표기", "표기"):
        # Alternative written forms; always attached to base_data.
        extract_alt_form_section(wxr, base_data, level_node)
    elif title_text in (
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
        "외부 링크",
    ):
        pass  # intentionally ignored section titles
    else:
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="ko/page/63")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_section_categories(wxr, page_data, base_data, level_node)

117 

118 

def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process one level-2 (language) section of a page."""
    entries_before = len(page_data)
    lang_name = clean_node(wxr, None, level2_node.largs)
    if not lang_name:
        lang_name = "unknown"
    # name_to_code returns "" for unrecognized language names.
    lang_code = name_to_code(lang_name, "ko") or "unknown"
    capture_codes = wxr.config.capture_language_codes
    if capture_codes is not None and lang_code not in capture_codes:
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    # Sound templates placed directly under the language heading apply
    # to every entry of this language.
    for t_node in level2_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in SOUND_TEMPLATES:
            extract_sound_template(wxr, base_data, t_node)

    for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    # The section had no POS subsection: extract the whole level as one POS.
    if len(page_data) == entries_before:
        extract_pos_section(wxr, page_data, base_data, level2_node, "")

152 

153 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Korean Wiktionary page into word-entry dicts.

    Page layout:
    https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    """
    # Skip appendix pages and a specific excluded page title.
    if page_title.startswith(("Appendix:", "T195546/NS111")):
        return []
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in root.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, level2_node)

    # Mark entries that ended up without any glosses.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

172 

173 

def extract_alt_form_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Extract alternative-form sections ("다른 표기"/"표기")."""
    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if t_node.template_name in ("alt", "alter"):
            extract_alt_template(wxr, base_data, t_node)

180 

181 

def extract_alt_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand an {{alt}}/{{alter}} template and record the listed forms.

    The expanded HTML is scanned span by span: spans in the template's
    language become forms, "*-Latn" spans become the romanization of the
    preceding form, and "label-content" spans become raw tags applied to
    every collected form.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    collected: list[Form] = []
    for span_tag in expanded.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_classes = span_tag.attrs.get("class", "").split()
        if span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                collected.append(Form(form=word))
        elif span_lang.endswith("-Latn") and collected:
            # Romanization span follows the form it transliterates.
            collected[-1].roman = clean_node(wxr, None, span_tag)
        elif "label-content" in span_classes and collected:
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                # Label applies to all forms gathered so far.
                for form in collected:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
    base_data.forms.extend(collected)