Coverage for src/wiktextract/extractor/ko/pos.py: 74% (168 statements)

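# Part-of-speech section extractor for the Korean Wiktionary
# (ko.wiktionary.org) edition of wiktextract.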
import re

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .linkage import (
    LINKAGE_TEMPLATES,
    extract_linkage_list_item,
    extract_linkage_template,
)
from .models import AltForm, Classifier, Form, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .tags import translate_raw_tags
from .translation import extract_translation_template

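# Parse one part-of-speech section: clone the shared base_data into a new
# WordEntry, map the Korean POS title through POS_DATA, then walk the
# section's child templates (sound, linkage, translation, headword) and
# gloss lists. The new entry is dropped again if it gained no senses,
# sounds, translations, or linkage data.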
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    page_data.append(base_data.model_copy(deep=True))
    orig_title = pos_title
    pos_title = pos_title.removeprefix("보조 ").strip()  # "보조" = auxiliary
    if pos_title in POS_DATA:
        page_data[-1].pos_title = orig_title
        pos_data = POS_DATA[pos_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        if (  # coverage: condition never true
            orig_title.startswith("보조 ")
            and "auxiliary" not in page_data[-1].tags
        ):
            page_data[-1].tags.append("auxiliary")

    has_linkage = False
    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode):
            if node.template_name in SOUND_TEMPLATES:
                extract_sound_template(wxr, page_data[-1], node)
            elif node.template_name in LINKAGE_TEMPLATES:
                has_linkage = extract_linkage_template(
                    wxr, page_data[-1], node, "derived"
                )
            elif node.template_name == "외국어":  # "foreign language"
                extract_translation_template(
                    wxr,
                    page_data[-1],
                    node,
                    page_data[-1].senses[-1].glosses[-1]
                    if len(page_data[-1].senses) > 0
                    else "",
                )
            # coverage: condition always true
            elif node.template_name in HEADER_TEMPLATES:
                extract_header_template(wxr, page_data[-1], node)
        elif node.kind == NodeKind.LIST:  # coverage: condition always true
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr,
                        page_data[-1],
                        list_item,
                        Sense(pattern=page_data[-1].pattern),
                    )
                else:
                    extract_unorderd_list_item(wxr, page_data[-1], list_item)

    if not (
        len(page_data[-1].senses) > 0
        or len(page_data[-1].sounds) > len(base_data.sounds)
        or len(page_data[-1].translations) > len(base_data.translations)
        or has_linkage
    ):
        page_data.pop()

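# Build a Sense from one gloss list item: accumulate gloss nodes, handle
# "... of" form-of templates, "라벨" ("label") tags, and "zh-mw" classifier
# templates, then recurse into nested "#" lists so sub-senses inherit the
# current sense; other nested lists are treated as unordered items.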
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense,
) -> None:
    gloss_nodes = []
    sense = parent_sense.model_copy(deep=True)
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            gloss_text = clean_node(wxr, sense, gloss_nodes)
            if len(gloss_text) > 0:  # coverage: condition always true
                sense.glosses.append(gloss_text)
                translate_raw_tags(sense)
                word_entry.senses.append(sense)
                gloss_nodes.clear()
            for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr, word_entry, nested_list_item, sense
                    )
                else:
                    extract_unorderd_list_item(
                        wxr, word_entry, nested_list_item
                    )
            continue
        elif isinstance(node, TemplateNode) and node.template_name.endswith(
            " of"
        ):
            extract_form_of_template(wxr, sense, node)
            gloss_nodes.append(node)
        elif isinstance(node, TemplateNode) and node.template_name == "라벨":
            # "라벨" = label; strip the surrounding parentheses and split
            # the comma-separated labels into raw tags
            sense.raw_tags.extend(
                [
                    raw_tag.strip()
                    for raw_tag in clean_node(wxr, sense, node)
                    .strip("()")
                    .split(",")
                ]
            )
        # coverage: this branch was never taken
        elif isinstance(node, TemplateNode) and node.template_name == "zh-mw":
            extract_zh_mw_template(wxr, node, sense)
        else:
            gloss_nodes.append(node)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

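# Handle a "*" list item, which the Korean edition overloads for several
# purposes: bold-numbered pseudo-glosses, "어원:" (etymology) lines,
# "참고/참조/활용:" (note/see also/conjugation) notes, inline linkage
# sections, "문형:" (sentence pattern) lines, and otherwise example
# sentences attached to the last sense.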
def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            # `* '''1.''' gloss text`, terrible obsolete layout
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = list_item.children[index + 1 :]
                extract_gloss_list_item(wxr, word_entry, new_list_item, Sense())
                break
        elif isinstance(node, str) and "어원:" in node:  # "어원" = etymology
            etymology_nodes = []
            etymology_nodes.append(node[node.index(":") + 1 :])
            etymology_nodes.extend(list_item.children[index + 1 :])
            e_text = clean_node(wxr, None, etymology_nodes)
            if len(e_text) > 0:  # coverage: condition always true
                word_entry.etymology_texts.append(e_text)
            break
        elif (
            isinstance(node, str)
            # "참고" = note, "참조" = see also, "활용" = conjugation
            and re.search(r"(?:참고|참조|활용):", node) is not None
        ):
            note_str = node[node.index(":") + 1 :].strip()
            note_str += clean_node(
                wxr,
                word_entry.senses[-1]
                if len(word_entry.senses) > 0
                else word_entry,
                list_item.children[index + 1 :],
            )
            if len(word_entry.senses) > 0:
                word_entry.senses[-1].note = note_str
            else:
                word_entry.note = note_str
            break
        elif (
            isinstance(node, str)
            and ":" in node
            and node[: node.index(":")].strip() in LINKAGE_SECTIONS
        ):
            extract_linkage_list_item(wxr, word_entry, list_item, "", False)
            break
        elif isinstance(node, str) and "문형:" in node:  # sentence pattern
            word_entry.pattern = node[node.index(":") + 1 :].strip()
            word_entry.pattern += clean_node(
                wxr, None, list_item.children[index + 1 :]
            )
            break
        else:
            if len(word_entry.senses) > 0:
                extract_example_list_item(
                    wxr, word_entry.senses[-1], list_item, word_entry.lang_code
                )

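# Record a form-of relation: tag the sense "form-of" and store the target
# word. "ko-hanja form of" passes the word as its first template parameter;
# the other "... of" templates pass it as the second.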
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    if "form-of" not in sense.tags:  # coverage: condition always true
        sense.tags.append("form-of")
    word_arg = 1 if t_node.template_name == "ko-hanja form of" else 2
    word = clean_node(wxr, None, t_node.template_parameters.get(word_arg, ""))
    if len(word) > 0:  # coverage: condition always true
        sense.form_of.append(AltForm(word=word))

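# Headword-line templates, under both their English and Korean names
# ("한국어 동사/명사/고유명사" = Korean verb/noun/proper noun).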
HEADER_TEMPLATES = frozenset(
    [
        "ko-verb",
        "한국어 동사",
        "ko-noun",
        "한국어 명사",
        "ko-proper noun",
        "한국어 고유명사",
    ]
)

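# Dispatch a headword-line template to the matching extractor.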
def extract_header_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    # coverage: condition always true
    if t_node.template_name in ["ko-verb", "한국어 동사"]:
        extract_ko_verb_template(wxr, word_entry, t_node)
    elif t_node.template_name in [
        "ko-noun",
        "한국어 명사",
        "ko-proper noun",
        "한국어 고유명사",
    ]:
        extract_ko_noun_template(wxr, word_entry, t_node)

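# Expand the verb headword template and read conjugated forms from the
# rendered "headword-line" span: plain-text children set the raw tag for
# the forms that follow (apparently the text after the last opening
# parenthesis), and each <b> tag holds one form.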
def extract_ko_verb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    # https://ko.wiktionary.org/wiki/틀:한국어_동사 ("틀" = Template)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, word_entry, expanded_node)
    for top_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        raw_tag = ""
        for node in top_span_tag.children:
            if isinstance(node, str):
                if "(" in node:
                    raw_tag = node[node.rindex("(") + 1 :].strip(", ")
                else:
                    raw_tag = node.strip(", ")
            elif isinstance(node, HTMLNode) and node.tag == "b":
                form = Form(form=clean_node(wxr, None, node))
                if raw_tag != "":  # coverage: condition always true
                    form.raw_tags.append(raw_tag)
                if form.form != "":  # coverage: condition always true
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

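# Noun and proper-noun headword templates only contribute an optional
# "한자" (Hanja) parameter, saved as a form tagged "hanja".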
def extract_ko_noun_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    # https://ko.wiktionary.org/wiki/틀:한국어_명사
    # https://ko.wiktionary.org/wiki/틀:한국어_고유명사
    hanja = clean_node(wxr, None, t_node.template_parameters.get("한자", ""))
    if hanja != "":
        word_entry.forms.append(Form(form=hanja, tags=["hanja"]))

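# Reduce a grammar-note section to its list items; note that each item
# overwrites word_entry.note, so only the last item's text survives.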
def extract_grammar_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        word_entry.note = clean_node(wxr, None, list_item.children)

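# Collect classifier words from the expanded template's <span> tags:
# spans of class Hani/Hant/Hans hold the classifier text (a bare "/" span
# apparently joins adjacent traditional/simplified variants into one
# group), and spans with a "title" attribute carry labels applied to the
# classifiers collected so far.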
def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    # Chinese inline classifier template; copied from the zh edition code
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)

317 translate_raw_tags(classifier)