Coverage for src/wiktextract/extractor/ko/pos.py: 72%

205 statements

coverage.py v7.12.0, created at 2025-11-26 11:06 +0000

import re

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .example import extract_example_list_item
from .linkage import (
    LINKAGE_TEMPLATES,
    extract_linkage_list_item,
    extract_linkage_template,
)
from .models import AltForm, Classifier, Form, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .tags import translate_raw_tags
from .translation import extract_translation_template


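# Parse a part-of-speech section. "보조" means "auxiliary": a title like
# "보조 동사" (auxiliary verb) is looked up in POS_DATA without the prefix
# and tagged "auxiliary" instead. The entry copied from base_data is popped
# again at the end if the section yielded no senses, sounds, translations,
# or linkage data of its own.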
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    page_data.append(base_data.model_copy(deep=True))
    orig_title = pos_title
    pos_title = pos_title.removeprefix("보조 ").strip()
    if pos_title in POS_DATA:
        page_data[-1].pos_title = orig_title
        pos_data = POS_DATA[pos_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        if (
            orig_title.startswith("보조 ")
            and "auxiliary" not in page_data[-1].tags
        ):
            page_data[-1].tags.append("auxiliary")

    has_linkage = False
    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode):
            if node.template_name in SOUND_TEMPLATES:
                extract_sound_template(wxr, page_data[-1], node)
            elif node.template_name in LINKAGE_TEMPLATES:
                has_linkage = extract_linkage_template(
                    wxr, page_data[-1], node, "derived"
                )
            elif node.template_name == "외국어":
                extract_translation_template(
                    wxr,
                    page_data[-1],
                    node,
                    page_data[-1].senses[-1].glosses[-1]
                    if len(page_data[-1].senses) > 0
                    else "",
                )
            elif node.template_name.startswith(
                base_data.lang_code + "-"
            ) or node.template_name.endswith((" 동사", " 명사", " 고유명사")):
                extract_headword_line_template(wxr, page_data[-1], node)
        elif node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr,
                        page_data[-1],
                        list_item,
                        Sense(pattern=page_data[-1].pattern),
                    )
                else:
                    extract_unorderd_list_item(wxr, page_data[-1], list_item)

    if not (
        len(page_data[-1].senses) > 0
        or len(page_data[-1].sounds) > len(base_data.sounds)
        or len(page_data[-1].translations) > len(base_data.translations)
        or has_linkage
    ):
        page_data.pop()


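# Parse one gloss list item. Nested "#" lists are subsenses handled by
# recursion; other nested lists hold examples or metadata. The "라벨"
# ("label") template contributes comma-separated raw tags, templates ending
# in " of" mark form-of senses, and "zh-mw" adds Chinese classifiers.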
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense,
) -> None:
    gloss_nodes = []
    sense = parent_sense.model_copy(deep=True)
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            gloss_text = clean_node(wxr, sense, gloss_nodes)
            if len(gloss_text) > 0:
                sense.glosses.append(gloss_text)
                translate_raw_tags(sense)
                word_entry.senses.append(sense)
                gloss_nodes.clear()
            for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
                if node.sarg.startswith("#") and node.sarg.endswith("#"):
                    extract_gloss_list_item(
                        wxr, word_entry, nested_list_item, sense
                    )
                else:
                    extract_unorderd_list_item(
                        wxr, word_entry, nested_list_item
                    )
            continue
        elif isinstance(node, TemplateNode) and node.template_name.endswith(
            " of"
        ):
            extract_form_of_template(wxr, sense, node)
            gloss_nodes.append(node)
        elif isinstance(node, TemplateNode) and node.template_name == "라벨":
            sense.raw_tags.extend(
                [
                    raw_tag.strip()
                    for raw_tag in clean_node(wxr, sense, node)
                    .strip("()")
                    .split(",")
                ]
            )
        elif isinstance(node, TemplateNode) and node.template_name == "zh-mw":
            extract_zh_mw_template(wxr, node, sense)
        else:
            gloss_nodes.append(node)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)


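# Parse an unordered ("*") list item. The leading text decides what it
# holds: "어원:" (etymology), "참고:", "참조:" or "활용:" (notes), a linkage
# section name, "문형:" (sentence pattern), a bold number like "'''1.'''"
# (an obsolete gloss layout), or, failing all of those, an example for the
# most recent sense.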
def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            # `* '''1.''' gloss text`, terrible obsolete layout
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = list_item.children[index + 1 :]
                extract_gloss_list_item(wxr, word_entry, new_list_item, Sense())
                break
        elif isinstance(node, str) and "어원:" in node:
            etymology_nodes = []
            etymology_nodes.append(node[node.index(":") + 1 :])
            etymology_nodes.extend(list_item.children[index + 1 :])
            e_text = clean_node(wxr, None, etymology_nodes)
            if len(e_text) > 0:
                word_entry.etymology_texts.append(e_text)
            break
        elif (
            isinstance(node, str)
            and re.search(r"(?:참고|참조|활용):", node) is not None
        ):
            note_str = node[node.index(":") + 1 :].strip()
            note_str += clean_node(
                wxr,
                word_entry.senses[-1]
                if len(word_entry.senses) > 0
                else word_entry,
                list_item.children[index + 1 :],
            )
            if len(word_entry.senses) > 0:
                word_entry.senses[-1].note = note_str
            else:
                word_entry.note = note_str
            break
        elif (
            isinstance(node, str)
            and ":" in node
            and node[: node.index(":")].strip() in LINKAGE_SECTIONS
        ):
            extract_linkage_list_item(wxr, word_entry, list_item, "", False)
            break
        elif isinstance(node, str) and "문형:" in node:
            word_entry.pattern = node[node.index(":") + 1 :].strip()
            word_entry.pattern += clean_node(
                wxr, None, list_item.children[index + 1 :]
            )
            break
        else:
            if len(word_entry.senses) > 0:
                extract_example_list_item(
                    wxr, word_entry.senses[-1], list_item, word_entry.lang_code
                )


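# Form-of templates name the target word in their second argument, except
# "ko-hanja form of", which uses the first.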
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
    word_arg = 1 if t_node.template_name == "ko-hanja form of" else 2
    word = clean_node(wxr, None, t_node.template_parameters.get(word_arg, ""))
    if len(word) > 0:
        sense.form_of.append(AltForm(word=word))


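# Copy the list items of a grammar note section into the entry's note
# field; when there are several list items, later ones overwrite earlier
# ones.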
def extract_grammar_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        word_entry.note = clean_node(wxr, None, list_item.children)


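# Classifier words separated by "/" appear to be traditional/simplified
# pairs of the same classifier; they are kept in one group so that a
# following "title" span applies its raw tag to every member.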
def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    # Chinese inline classifier template
    # copied from zh edition code
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)


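# Expand a headword-line template (a name starting with the language code
# plus "-" or ending in " 동사" (verb), " 명사" (noun) or " 고유명사"
# (proper noun)) and collect forms from the rendered HTML: the bold
# headword ("canonical"), romanizations, gender abbreviations, italic raw
# tags ("또는" means "or"), and historical kana for Japanese entries.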
def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    forms = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        i_tags = []
        for html_node in main_span_tag.find_child(NodeKind.HTML):
            class_names = html_node.attrs.get("class", "").split()
            if html_node.tag == "strong" and "headword" in class_names:
                ruby, no_ruby = extract_ruby(wxr, html_node)
                strong_str = clean_node(wxr, None, no_ruby)
                if strong_str not in ["", wxr.wtp.title] or len(ruby) > 0:
                    forms.append(
                        Form(form=strong_str, tags=["canonical"], ruby=ruby)
                    )
            elif html_node.tag == "span":
                if "headword-tr" in class_names or "tr" in class_names:
                    roman = clean_node(wxr, None, html_node)
                    if (
                        len(forms) > 0
                        and "canonical" not in forms[-1].tags
                        and "romanization" not in forms[-1].tags
                    ):
                        forms[-1].roman = roman
                    elif roman != "":
                        forms.append(Form(form=roman, tags=["romanization"]))
                elif "gender" in class_names:
                    for abbr_tag in html_node.find_html("abbr"):
                        gender_tag = clean_node(wxr, None, abbr_tag)
                        if (
                            len(forms) > 0
                            and "canonical" not in forms[-1].tags
                            and "romanization" not in forms[-1].tags
                        ):
                            forms[-1].raw_tags.append(gender_tag)
                        else:
                            word_entry.raw_tags.append(gender_tag)
                elif "ib-content" in class_names:
                    raw_tag = clean_node(wxr, None, html_node)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
            elif html_node.tag == "sup" and word_entry.lang_code == "ja":
                forms.append(extract_historical_kana(wxr, html_node))
            elif html_node.tag == "i":
                if len(i_tags) > 0:
                    word_entry.raw_tags.extend(i_tags)
                    i_tags.clear()
                for i_child in html_node.children:
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":
                        i_tags.append(raw_tag)
            elif html_node.tag == "b":
                ruby, no_ruby = extract_ruby(wxr, html_node)
                for form_str in filter(
                    None,
                    map(str.strip, clean_node(wxr, None, no_ruby).split(",")),
                ):
                    form = Form(form=form_str, ruby=ruby)
                    if i_tags == ["또는"]:
                        if len(forms) > 0:
                            form.raw_tags.extend(forms[-1].raw_tags)
                    else:
                        form.raw_tags.extend(i_tags)
                    forms.append(form)
                i_tags.clear()

        if len(i_tags) > 0:
            word_entry.raw_tags.extend(i_tags)
    for form in forms:
        translate_raw_tags(form)
    word_entry.forms.extend(forms)
    clean_node(wxr, word_entry, expanded_node)
    translate_raw_tags(word_entry)


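# A <sup> node after a Japanese headword holds the historical kana
# spelling, tagged "archaic", plus its romanization in a "tr" span.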
def extract_historical_kana(
    wxr: WiktextractContext, sup_node: HTMLNode
) -> Form:
    form = Form(form="", tags=["archaic"])
    for strong_node in sup_node.find_html("strong"):
        form.form = clean_node(wxr, None, strong_node)
    for span_node in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        form.roman = clean_node(wxr, None, span_node)
    return form