Coverage for src/wiktextract/extractor/ko/pos.py: 89%

139 statements  

coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
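# Source listing: POS (part-of-speech) section extractor for the Korean
# Wiktionary edition (ko.wiktionary.org).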

import re

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .linkage import (
    LINKAGE_TEMPLATES,
    extract_linkage_list_item,
    extract_linkage_template,
)
from .models import AltForm, Form, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .tags import translate_raw_tags
from .translation import extract_translation_template


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    page_data.append(base_data.model_copy(deep=True))
    orig_title = pos_title
    pos_title = pos_title.removeprefix("보조 ").strip()  # "보조": auxiliary
    if pos_title in POS_DATA:
        page_data[-1].pos_title = orig_title
        pos_data = POS_DATA[pos_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        if (  # coverage: condition was never true in the recorded run
            orig_title.startswith("보조 ")
            and "auxiliary" not in page_data[-1].tags
        ):
            page_data[-1].tags.append("auxiliary")

    has_linkage = False
    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode):
            if node.template_name in SOUND_TEMPLATES:
                extract_sound_template(wxr, page_data[-1], node)
            elif node.template_name in LINKAGE_TEMPLATES:
                has_linkage = extract_linkage_template(wxr, page_data[-1], node)
            elif node.template_name == "외국어":  # "foreign language"
                extract_translation_template(
                    wxr,
                    page_data[-1],
                    node,
                    page_data[-1].senses[-1].glosses[-1]
                    if len(page_data[-1].senses) > 0
                    else "",
                )
            elif node.template_name in HEADER_TEMPLATES:  # coverage: always true in the recorded run
                extract_header_template(wxr, page_data[-1], node)
        elif node.kind == NodeKind.LIST:  # coverage: always true in the recorded run
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr,
                        page_data[-1],
                        list_item,
                        Sense(pattern=page_data[-1].pattern),
                    )
                else:
                    extract_unorderd_list_item(wxr, page_data[-1], list_item)

    # Drop the entry again if this section contributed no new data
    if not (
        len(page_data[-1].senses) > 0
        or len(page_data[-1].sounds) > len(base_data.sounds)
        or len(page_data[-1].translations) > len(base_data.translations)
        or has_linkage
    ):
        page_data.pop()

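# A hedged sketch of the section shape this function walks (hypothetical
# layout, real entries vary): header and sound templates first, then a "#"
# gloss list whose nested items carry child glosses and examples:
#   {{한국어 동사}}
#   # first gloss
#   ## child gloss
#   #* example sentence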

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense,
) -> None:
    gloss_nodes = []
    sense = parent_sense.model_copy(deep=True)
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            gloss_text = clean_node(wxr, sense, gloss_nodes)
            if len(gloss_text) > 0:  # coverage: always true in the recorded run
                sense.glosses.append(gloss_text)
                translate_raw_tags(sense)
                word_entry.senses.append(sense)
                gloss_nodes.clear()
            for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr, word_entry, nested_list_item, sense
                    )
                else:
                    extract_unorderd_list_item(
                        wxr, word_entry, nested_list_item
                    )
            continue
        elif isinstance(node, TemplateNode) and node.template_name.endswith(
            " of"
        ):
            extract_form_of_template(wxr, sense, node)
            gloss_nodes.append(node)
        elif isinstance(node, TemplateNode) and node.template_name == "라벨":
            # "라벨": label template; its rendered text becomes raw tags
            sense.raw_tags.extend(
                [
                    raw_tag.strip()
                    for raw_tag in clean_node(wxr, sense, node)
                    .strip("()")
                    .split(",")
                ]
            )
        else:
            gloss_nodes.append(node)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

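# Note on the recursion above: a nested "#" list re-enters
# extract_gloss_list_item with the current Sense as parent, and each child
# deep-copies it (model_copy), so child senses inherit the parent's glosses
# and tags; any other nested list kind falls through to
# extract_unorderd_list_item.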

def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            # `* '''1.''' gloss text`, terrible obsolete layout
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            # matches sense numbers such as "1.", "1-2." or "3"
            if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = list_item.children[index + 1 :]
                extract_gloss_list_item(wxr, word_entry, new_list_item, Sense())
                break
        elif isinstance(node, str) and "어원:" in node:  # "어원": etymology
            etymology_nodes = []
            etymology_nodes.append(node[node.index(":") + 1 :])
            etymology_nodes.extend(list_item.children[index + 1 :])
            e_text = clean_node(wxr, None, etymology_nodes)
            if len(e_text) > 0:  # coverage: always true in the recorded run
                word_entry.etymology_texts.append(e_text)
            break
        elif (
            isinstance(node, str)
            # "참고"/"참조": reference note; "활용": conjugation
            and re.search(r"(?:참고|참조|활용):", node) is not None
        ):
            note_str = node[node.index(":") + 1 :].strip()
            note_str += clean_node(
                wxr,
                word_entry.senses[-1]
                if len(word_entry.senses) > 0
                else word_entry,
                list_item.children[index + 1 :],
            )
            if len(word_entry.senses) > 0:
                word_entry.senses[-1].note = note_str
            else:
                word_entry.note = note_str
            break
        elif (
            isinstance(node, str)
            and ":" in node
            and node[: node.index(":")].strip() in LINKAGE_SECTIONS
        ):
            extract_linkage_list_item(wxr, word_entry, list_item, "", False)
            break
        elif isinstance(node, str) and "문형:" in node:  # sentence pattern
            word_entry.pattern = node[node.index(":") + 1 :].strip()
            word_entry.pattern += clean_node(
                wxr, None, list_item.children[index + 1 :]
            )
            break
        else:
            if len(word_entry.senses) > 0:
                extract_example_list_item(
                    wxr, word_entry.senses[-1], list_item, word_entry.lang_code
                )

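# Summary of the "*"-list prefixes recognized above: a bold "1."-style sense
# number restarts gloss parsing, "어원:" (etymology), "참고:"/"참조:"/"활용:"
# (notes/conjugation), a LINKAGE_SECTIONS name, and "문형:" (sentence pattern)
# each claim the whole item; anything else is treated as example text for the
# most recent sense.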

def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    if "form-of" not in sense.tags:  # coverage: always true in the recorded run
        sense.tags.append("form-of")
    word_arg = 1 if t_node.template_name == "ko-hanja form of" else 2
    word = clean_node(wxr, None, t_node.template_parameters.get(word_arg, ""))
    if len(word) > 0:  # coverage: always true in the recorded run
        sense.form_of.append(AltForm(word=word))

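# A minimal illustration (hypothetical wikitext, not from the test data):
# "{{ko-hanja form of|말}}" keeps the target word in positional parameter 1,
# while other "... of" templates keep it in parameter 2; either way the word
# lands in sense.form_of as an AltForm.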

HEADER_TEMPLATES = frozenset(
    [
        "ko-verb",
        "한국어 동사",  # "Korean verb"
        "ko-noun",
        "한국어 명사",  # "Korean noun"
        "ko-proper noun",
        "한국어 고유명사",  # "Korean proper noun"
    ]
)


def extract_header_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    # coverage: first branch was always taken in the recorded run
    if t_node.template_name in ["ko-verb", "한국어 동사"]:
        extract_ko_verb_template(wxr, word_entry, t_node)
    elif t_node.template_name in [
        "ko-noun",
        "한국어 명사",
        "ko-proper noun",
        "한국어 고유명사",
    ]:
        extract_ko_noun_template(wxr, word_entry, t_node)


def extract_ko_verb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    # https://ko.wiktionary.org/wiki/틀:한국어_동사
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, word_entry, expanded_node)
    for top_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        raw_tag = ""
        for node in top_span_tag.children:
            if isinstance(node, str):
                if "(" in node:
                    raw_tag = node[node.rindex("(") + 1 :].strip(", ")
                else:
                    raw_tag = node.strip(", ")
            elif isinstance(node, HTMLNode) and node.tag == "b":
                form = Form(form=clean_node(wxr, None, node))
                if raw_tag != "":  # coverage: always true in the recorded run
                    form.raw_tags.append(raw_tag)
                if form.form != "":  # coverage: always true in the recorded run
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

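# Rough shape of the expanded headword line parsed above (structure inferred
# from the parsing code; the exact labels are an assumption):
#   <span class="headword-line">word (label <b>form</b>, label <b>form</b>)</span>
# The plain-text run before each <b> tag supplies that form's raw tag.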

def extract_ko_noun_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    # https://ko.wiktionary.org/wiki/틀:한국어_명사
    # https://ko.wiktionary.org/wiki/틀:한국어_고유명사
    # "한자" ("hanja") holds the Chinese-character spelling of the noun
    hanja = clean_node(wxr, None, t_node.template_parameters.get("한자", ""))
    if hanja != "":
        word_entry.forms.append(Form(form=hanja, tags=["hanja"]))


def extract_grammar_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        word_entry.note = clean_node(wxr, None, list_item.children)

277 word_entry.note = clean_node(wxr, None, list_item.children)