Coverage for src/wiktextract/extractor/ko/pos.py: 89%

138 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2 

3from wikitextprocessor import ( 

4 HTMLNode, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .example import extract_example_list_item 

14from .linkage import ( 

15 LINKAGE_TEMPLATES, 

16 extract_linkage_list_item, 

17 extract_linkage_template, 

18) 

19from .models import AltForm, Form, Sense, WordEntry 

20from .section_titles import LINKAGE_SECTIONS, POS_DATA 

21from .sound import SOUND_TEMPLATES, extract_sound_template 

22from .tags import translate_raw_tags 

23from .translation import extract_translation_template 

24 

25 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Build a new word entry for one part-of-speech section.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled
    from the section's templates and lists.  If the finished entry holds
    nothing beyond the shared base data, it is removed again.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    orig_title = pos_title
    # "보조 " ("auxiliary") prefixed titles map to the plain POS name.
    pos_title = pos_title.removeprefix("보조 ").strip()
    if pos_title in POS_DATA:
        entry.pos_title = orig_title
        pos_data = POS_DATA[pos_title]
        entry.pos = pos_data["pos"]
        entry.tags.extend(pos_data.get("tags", []))
        if orig_title.startswith("보조 ") and "auxiliary" not in entry.tags:
            entry.tags.append("auxiliary")

    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode):
            if node.template_name in SOUND_TEMPLATES:
                extract_sound_template(wxr, entry, node)
            elif node.template_name in LINKAGE_TEMPLATES:
                extract_linkage_template(wxr, entry, node)
            elif node.template_name == "외국어":
                # Translations attach to the most recent gloss, if any.
                last_gloss = (
                    entry.senses[-1].glosses[-1]
                    if len(entry.senses) > 0
                    else ""
                )
                extract_translation_template(wxr, entry, node, last_gloss)
            elif node.template_name in HEADER_TEMPLATES:
                extract_header_template(wxr, entry, node)
        elif node.kind == NodeKind.LIST:
            # Ordered ("#") lists carry glosses; anything else is
            # etymology/notes/linkage/example material.
            is_gloss_list = node.sarg.startswith("#") and node.sarg.endswith(
                "#"
            )
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                if is_gloss_list:
                    extract_gloss_list_item(
                        wxr, entry, list_item, Sense(pattern=entry.pattern)
                    )
                else:
                    extract_unorderd_list_item(wxr, entry, list_item)

    # Drop the entry if it gained nothing over the shared base data.
    if len(
        entry.model_dump(exclude_defaults=True, exclude={"pos_title", "tags"})
    ) == len(base_data.model_dump(exclude_defaults=True)):
        page_data.pop()

82 

83 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense,
) -> None:
    """Extract one gloss list item, recursing into nested lists.

    The sense starts as a deep copy of ``parent_sense`` so that tags and
    earlier glosses are inherited by child list items.  A sense is only
    appended to ``word_entry.senses`` once non-empty gloss text exists.
    """
    sense = parent_sense.model_copy(deep=True)
    pending_nodes = []
    for child in list_item.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Flush gloss text gathered so far before descending.
            gloss_text = clean_node(wxr, sense, pending_nodes)
            if len(gloss_text) > 0:
                sense.glosses.append(gloss_text)
                translate_raw_tags(sense)
                word_entry.senses.append(sense)
            pending_nodes.clear()
            is_gloss_list = child.sarg.startswith(
                "#"
            ) and child.sarg.endswith("#")
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                if is_gloss_list:
                    extract_gloss_list_item(
                        wxr, word_entry, nested_item, sense
                    )
                else:
                    extract_unorderd_list_item(wxr, word_entry, nested_item)
        elif isinstance(child, TemplateNode) and child.template_name.endswith(
            " of"
        ):
            # "form of" templates both mark the sense and render text.
            extract_form_of_template(wxr, sense, child)
            pending_nodes.append(child)
        elif isinstance(child, TemplateNode) and child.template_name == "라벨":
            # "라벨" ("label") expands to "(tag, tag, ...)".
            label_text = clean_node(wxr, sense, child).strip("()")
            sense.raw_tags.extend(
                raw_tag.strip() for raw_tag in label_text.split(",")
            )
        else:
            pending_nodes.append(child)

    gloss_text = clean_node(wxr, sense, pending_nodes)
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

132 

133 

def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Dispatch an unordered ("*") list item to the matching extractor.

    The item's leading text decides its meaning: a bold number marks an
    obsolete gloss layout, "어원:" marks etymology, "참고/참조/활용:" a
    note, a linkage-section keyword a linkage list, "문형:" a sentence
    pattern; anything else is treated as example content for the most
    recent sense.
    """
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            # `* '''1.''' gloss text`, terrible obsolete layout
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = list_item.children[index + 1 :]
                extract_gloss_list_item(wxr, word_entry, new_list_item, Sense())
                break
        elif isinstance(node, str) and "어원:" in node:
            # Etymology: everything after the colon plus later siblings.
            etymology_nodes = []
            etymology_nodes.append(node[node.index(":") + 1 :])
            etymology_nodes.extend(list_item.children[index + 1 :])
            e_text = clean_node(wxr, None, etymology_nodes)
            if len(e_text) > 0:
                word_entry.etymology_texts.append(e_text)
            break
        elif (
            isinstance(node, str)
            and re.search(r"(?:참고|참조|활용):", node) is not None
        ):
            # Note text attaches to the last sense when one exists,
            # otherwise to the whole entry.
            note_str = node[node.index(":") + 1 :].strip()
            note_str += clean_node(
                wxr,
                word_entry.senses[-1]
                if len(word_entry.senses) > 0
                else word_entry,
                list_item.children[index + 1 :],
            )
            if len(word_entry.senses) > 0:
                word_entry.senses[-1].note = note_str
            else:
                word_entry.note = note_str
            break
        elif (
            isinstance(node, str)
            and ":" in node
            and node[: node.index(":")].strip() in LINKAGE_SECTIONS
        ):
            extract_linkage_list_item(wxr, word_entry, list_item, "", False)
            break
        elif isinstance(node, str) and "문형:" in node:
            # Sentence pattern ("문형"), e.g. argument structure of a verb.
            word_entry.pattern = node[node.index(":") + 1 :].strip()
            word_entry.pattern += clean_node(
                wxr, None, list_item.children[index + 1 :]
            )
            break
        else:
            if len(word_entry.senses) > 0:
                extract_example_list_item(
                    wxr, word_entry.senses[-1], list_item, word_entry.lang_code
                )
            # Fix: the whole list item was just handed to the example
            # extractor; without this break every remaining child node
            # fell into this branch again and re-extracted the same
            # list item, producing duplicate examples.
            break

195 

196 

def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the target word of a "... of" inflection template."""
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
    # "ko-hanja form of" keeps the target in its first positional
    # argument; the other "... of" templates use the second.
    arg_index = 1 if t_node.template_name == "ko-hanja form of" else 2
    target = clean_node(
        wxr, None, t_node.template_parameters.get(arg_index, "")
    )
    if target != "":
        sense.form_of.append(AltForm(word=target))

206 

207 

# Headword-line templates recognised by extract_header_template();
# each English name is paired with its Korean-language alias.
HEADER_TEMPLATES = frozenset(
    {
        "ko-verb",
        "한국어 동사",
        "ko-noun",
        "한국어 명사",
        "ko-proper noun",
        "한국어 고유명사",
    }
)

218 

219 

def extract_header_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route a headword-line template to its specialised extractor."""
    name = t_node.template_name
    if name in ("ko-verb", "한국어 동사"):
        extract_ko_verb_template(wxr, word_entry, t_node)
    elif name in (
        "ko-noun",
        "한국어 명사",
        "ko-proper noun",
        "한국어 고유명사",
    ):
        extract_ko_noun_template(wxr, word_entry, t_node)

232 

233 

def extract_ko_verb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract conjugated forms from the Korean verb headword template.

    https://ko.wiktionary.org/wiki/틀:한국어_동사
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Collect categories the expanded template adds to the entry.
    clean_node(wxr, word_entry, expanded)
    for headword_span in expanded.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        current_tag = ""
        for child in headword_span.children:
            if isinstance(child, str):
                # Plain text carries the label for the following bold
                # form, e.g. "(infinitive" or ", sequential".
                if "(" in child:
                    current_tag = child[child.rindex("(") + 1 :].strip(", ")
                else:
                    current_tag = child.strip(", ")
            elif isinstance(child, HTMLNode) and child.tag == "b":
                form_text = clean_node(wxr, None, child)
                if form_text == "":
                    continue
                form = Form(form=form_text)
                if current_tag != "":
                    form.raw_tags.append(current_tag)
                translate_raw_tags(form)
                word_entry.forms.append(form)

259 

260 

def extract_ko_noun_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the hanja spelling from the Korean noun headword templates.

    https://ko.wiktionary.org/wiki/틀:한국어_명사
    https://ko.wiktionary.org/wiki/틀:한국어_고유명사
    """
    # "한자" ("hanja") is the template's Chinese-character spelling arg.
    hanja_form = clean_node(
        wxr, None, t_node.template_parameters.get("한자", "")
    )
    if len(hanja_form) > 0:
        word_entry.forms.append(Form(form=hanja_form, tags=["hanja"]))

269 

270 

def extract_grammar_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Store the grammar-note section's list-item text on the entry.

    NOTE(review): each list item overwrites ``word_entry.note``, so only
    the last item survives when a section carries several — confirm
    sections never hold more than one note item.
    """
    for note_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        word_entry.note = clean_node(wxr, None, note_item.children)