Coverage for src / wiktextract / extractor / ko / page.py: 34%

194 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-01 08:08 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 HTMLNode, 

8 LevelNode, 

9 NodeKind, 

10 TemplateNode, 

11 WikiNode, 

12) 

13 

14from ...page import clean_node 

15from ...wxr_context import WiktextractContext 

16from .etymology import extract_etymology_section, extract_ja_kanjitab_template 

17from .linkage import extract_linkage_section 

18from .models import Form, Linkage, Sense, WordEntry 

19from .pos import extract_grammar_note_section, extract_pos_section 

20from .section_titles import LINKAGE_SECTIONS, POS_DATA 

21from .sound import ( 

22 SOUND_TEMPLATES, 

23 extract_sound_section, 

24 extract_sound_template, 

25) 

26from .tags import translate_raw_tags 

27from .translation import extract_translation_section 

28 

29 

def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect category links and category templates found in a section.

    Categories are attached to the most recently created word entry when
    one exists, otherwise to ``base_data`` (so they are inherited by
    entries created later from it).
    """
    # Fix: removed coverage-report artifact fused into the template-name
    # condition line; hoisted the repeated target expression.
    target = page_data[-1] if len(page_data) > 0 else base_data
    for link_node in level_node.find_child(NodeKind.LINK):
        # Category links like [[분류:...]] are absorbed by clean_node.
        clean_node(wxr, target, link_node)
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ["C", "topics"]:
            clean_node(wxr, target, t_node)

45 

46 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one wiki section to the matching extractor by its title.

    Recurses into child sections and finally harvests section-level
    categories. Fix: removed coverage-report artifacts that had been
    fused into two condition lines.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    # Drop trailing numbering such as "명사 1" and wrapping parentheses.
    title_text = re.sub(r"\s*\d+$", "", title_text).strip("() ")
    if "(" in title_text:
        # Keep only the part before a parenthesized qualifier.
        title_text = title_text[: title_text.index("(")]
    # "보조 " prefix marks auxiliary POS sections ("보조 동사" etc.).
    if title_text.removeprefix("보조 ").strip() in POS_DATA:
        orig_page_data_len = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if (
            len(page_data) == orig_page_data_len
            and title_text in LINKAGE_SECTIONS
            and len(page_data) > 0
        ):  # try extract as linkage section
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title_text]
            )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "번역":  # translations
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "발음":  # pronunciation
        extract_sound_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "어원":  # etymology
        extract_etymology_section(
            wxr,
            # Reuse the last entry only if it has no etymology yet,
            # otherwise stash the text on base_data for later entries.
            page_data[-1]
            if len(page_data) > 0 and len(page_data[-1].etymology_texts) == 0
            else base_data,
            level_node,
        )
    elif title_text == "어법 주의 사항":  # usage/grammar notes
        extract_grammar_note_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
        )
    elif title_text in ["다른 표기", "표기"]:  # alternative forms
        extract_alt_form_section(wxr, base_data, level_node)
    elif title_text in [
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
        "외부 링크",
    ]:
        pass  # ignore
    else:
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="ko/page/63")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_section_categories(wxr, page_data, base_data, level_node)

119 

120 

def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process one level-2 (language) section of a page.

    Builds a ``base_data`` template entry for the language, handles
    language-level templates (sounds, redirects, zh/ja form tables),
    then parses all child sections. Fix: removed coverage-report
    artifacts fused into several condition lines.
    """
    pre_data_len = len(page_data)
    lang_name = clean_node(wxr, None, level2_node.largs)
    if lang_name == "":
        lang_name = "unknown"
    lang_code = name_to_code(lang_name, "ko")
    if lang_code == "":
        lang_code = "unknown"
    if (
        wxr.config.capture_language_codes is not None
        and lang_code not in wxr.config.capture_language_codes
    ):
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    for t_node in level2_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in SOUND_TEMPLATES:
            extract_sound_template(wxr, base_data, t_node)
        elif t_node.template_name == "zh-see":
            # Soft redirect to another Chinese character/spelling.
            base_data.redirects.append(
                clean_node(wxr, None, t_node.template_parameters.get(1, ""))
            )
            clean_node(wxr, base_data, t_node)
        elif t_node.template_name in ["ja-see", "ja-see-kango"]:
            extract_ja_see_template(wxr, base_data, t_node)
        elif t_node.template_name == "zh-forms":
            extract_zh_forms(wxr, base_data, t_node)
        elif (
            t_node.template_name.endswith("-kanjitab")
            or t_node.template_name == "ja-kt"
        ):
            extract_ja_kanjitab_template(wxr, t_node, base_data)
    if len(base_data.redirects) > 0:
        # Redirect-only entries are kept even without a POS section.
        page_data.append(base_data)
    for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    # no POS section
    if len(page_data) == pre_data_len:
        extract_pos_section(wxr, page_data, base_data, level2_node, "")

169 

170 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Korean Wiktionary page into word-entry dicts.

    Returns a list of ``WordEntry`` model dumps (defaults excluded).
    Fix: removed a coverage-report artifact fused into the title check.
    """
    # page layout
    # https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    # https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    # NOTE(review): "T195546/NS111" looks like a Phabricator test-page
    # title exclusion — confirm it is still needed.
    if page_title.startswith(("Appendix:", "T195546/NS111")):
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, level2_node)

    for data in page_data:
        if len(data.senses) == 0:
            # Keep entries without glosses but mark them explicitly.
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]

189 

190 

def extract_alt_form_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Pull alternative spellings from "alt"/"alter" templates in a section."""
    for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if template.template_name not in ["alt", "alter"]:
            continue
        extract_alt_template(wxr, base_data, template)

197 

198 

def extract_alt_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand an "alt"/"alter" template and add its forms to ``base_data``.

    Spans matching the template's language argument become forms, a
    trailing "-Latn" span supplies the romanization of the previous
    form, and label spans become raw tags on every collected form.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    target_lang = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    collected: list[Form] = []
    for span in expanded.find_html("span"):
        lang_attr = span.attrs.get("lang", "")
        classes = span.attrs.get("class", "").split()
        if lang_attr == target_lang:
            text = clean_node(wxr, None, span)
            if text != "":
                collected.append(Form(form=text))
        elif lang_attr.endswith("-Latn") and len(collected) > 0:
            collected[-1].roman = clean_node(wxr, None, span)
        elif "label-content" in classes and len(collected) > 0:
            label = clean_node(wxr, None, span)
            if label != "":
                for entry in collected:
                    entry.raw_tags.append(label)
                    translate_raw_tags(entry)
    base_data.forms.extend(collected)

223 

224 

def extract_ja_see_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Record redirect targets from a "ja-see"/"ja-see-kango" template."""
    # Positional (integer-keyed) parameters are the redirect targets.
    base_data.redirects.extend(
        clean_node(wxr, None, value)
        for key, value in t_node.template_parameters.items()
        if isinstance(key, int)
    )
    # Also absorb any categories the template emits.
    clean_node(wxr, base_data, t_node)

232 

233 

def extract_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand a "zh-forms" template and walk its table of Chinese forms.

    Header cells establish the row label/tags; data cells in the same
    row are parsed unless the header cell already contained the forms.
    """
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded.find_child(NodeKind.TABLE):
        for table_row in table_node.find_child(NodeKind.TABLE_ROW):
            header_text = ""
            header_tags: list[str] = []
            forms_in_header = False
            for row_cell in table_row.find_child(cell_kinds):
                if row_cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text, header_tags, forms_in_header = (
                        extract_zh_forms_header_cell(wxr, base_data, row_cell)
                    )
                elif not forms_in_header:
                    extract_zh_forms_data_cell(
                        wxr, base_data, row_cell, header_text, header_tags
                    )

259 

260 

def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Parse a zh-forms header cell.

    Returns ``(header_text, header_tags, has_span)`` where
    ``header_text`` is the label before the first <span>, ``header_tags``
    are its "/" or " 및 "-separated parts, and ``has_span`` tells the
    caller the forms were already extracted here from the spans.
    """
    has_span = False
    span_start = len(header_cell.children)
    # Locate the earliest <span>; everything before it is the label.
    for idx, _span in header_cell.find_html("span", with_index=True):
        has_span = True
        if idx < span_start:
            span_start = idx
    header_text = clean_node(wxr, None, header_cell.children[:span_start])
    header_tags = [
        part.strip()
        for part in re.split(r"/| 및 ", header_text)
        if part.strip() != ""
    ]
    for span_node in header_cell.find_html_recursively("span"):
        lang_attr = span_node.attrs.get("lang", "")
        word_nodes = []
        superscript_title = ""
        for child in span_node.children:
            if isinstance(child, HTMLNode) and child.tag == "sup":
                # <sup><span title="..."> carries an extra annotation.
                for inner_span in child.find_html("span"):
                    superscript_title = inner_span.attrs.get("title", "")
            else:
                word_nodes.append(child)
        if lang_attr not in ["zh-Hant", "zh-Hans"]:
            continue
        for word in clean_node(wxr, None, word_nodes).split("/"):
            if word in [base_data.word, ""]:
                continue
            new_form = Form(form=word, raw_tags=header_tags)
            if superscript_title != "":
                new_form.raw_tags.append(superscript_title)
            translate_raw_tags(new_form)
            base_data.forms.append(new_form)
    return header_text, header_tags, has_span

296 

297 

def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Parse one zh-forms data cell into forms (or anagram linkages).

    ``row_header``/``row_header_tags`` come from the row's header cell.
    Recurses into "white-space:nowrap;" wrapper spans; small-font spans
    annotate the forms collected so far in this cell.
    """
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                # NOTE(review): this English literal never matches the
                # Korean headers seen here ("어구전철") — confirm intent.
                if row_header != "anagram":
                    # Fix: copy instead of assigning the caller's list —
                    # direct assignment aliased one list across every
                    # form, so the append in the small-font branch above
                    # mutated row_header_tags and sibling forms.
                    form.raw_tags = list(row_header_tags)
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "어구전철":  # anagram row
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)