Coverage for src/wiktextract/extractor/ko/page.py: 34%

192 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-27 07:52 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 HTMLNode, 

8 LevelNode, 

9 NodeKind, 

10 TemplateNode, 

11 WikiNode, 

12) 

13 

14from ...page import clean_node 

15from ...wxr_context import WiktextractContext 

16from .etymology import extract_etymology_section 

17from .linkage import extract_linkage_section 

18from .models import Form, Linkage, Sense, WordEntry 

19from .pos import extract_grammar_note_section, extract_pos_section 

20from .section_titles import LINKAGE_SECTIONS, POS_DATA 

21from .sound import ( 

22 SOUND_TEMPLATES, 

23 extract_sound_section, 

24 extract_sound_template, 

25) 

26from .tags import translate_raw_tags 

27from .translation import extract_translation_section 

28 

29 

def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect category links and category templates found under a section.

    Categories are attached to the newest word entry, falling back to
    ``base_data`` when no entry has been created yet.
    """
    target = page_data[-1] if page_data else base_data
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, target, link_node)
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ("C", "topics"):
            clean_node(wxr, target, t_node)

45 

46 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch a section node to the matching extractor based on its title."""
    section_title = clean_node(wxr, None, level_node.largs)
    # Strip trailing numbering ("명사 1") and surrounding parentheses.
    section_title = re.sub(r"\s*\d+$", "", section_title).strip("() ")
    if "(" in section_title:
        section_title = section_title[: section_title.index("(")]
    # Default target entry for the extractors below.
    target = page_data[-1] if page_data else base_data
    if section_title.removeprefix("보조 ").strip() in POS_DATA:
        before = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, section_title)
        if (
            len(page_data) == before
            and section_title in LINKAGE_SECTIONS
            and page_data
        ):
            # POS extraction produced nothing; retry as a linkage section.
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[section_title]
            )
    elif section_title in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, target, level_node, LINKAGE_SECTIONS[section_title]
        )
    elif section_title == "번역":  # translations
        extract_translation_section(wxr, target, level_node)
    elif section_title == "발음":  # pronunciation
        extract_sound_section(wxr, target, level_node)
    elif section_title == "어원":  # etymology
        # Reuse the last entry only if it has no etymology text yet.
        extract_etymology_section(
            wxr,
            page_data[-1]
            if page_data and not page_data[-1].etymology_texts
            else base_data,
            level_node,
        )
    elif section_title == "어법 주의 사항":  # usage/grammar notes
        extract_grammar_note_section(wxr, target, level_node)
    elif section_title in ("다른 표기", "표기"):  # alternative forms
        extract_alt_form_section(wxr, base_data, level_node)
    elif section_title not in (
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
        "외부 링크",
    ):  # everything in this tuple is deliberately ignored
        wxr.wtp.debug(f"unknown title: {section_title}", sortid="ko/page/63")

    for sub_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, sub_level)

    extract_section_categories(wxr, page_data, base_data, level_node)

119 

120 

def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process one language (level-2) section of a page."""
    entries_before = len(page_data)
    lang_name = clean_node(wxr, None, level2_node.largs) or "unknown"
    lang_code = name_to_code(lang_name, "ko") or "unknown"
    capture_codes = wxr.config.capture_language_codes
    if capture_codes is not None and lang_code not in capture_codes:
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    for t_node in level2_node.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name in SOUND_TEMPLATES:
            extract_sound_template(wxr, base_data, t_node)
        elif name == "zh-see":
            base_data.redirects.append(
                clean_node(wxr, None, t_node.template_parameters.get(1, ""))
            )
            clean_node(wxr, base_data, t_node)  # pick up categories
        elif name in ("ja-see", "ja-see-kango"):
            extract_ja_see_template(wxr, base_data, t_node)
        elif name == "zh-forms":
            extract_zh_forms(wxr, base_data, t_node)
    if base_data.redirects:
        page_data.append(base_data)
    for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    # No POS subsection produced an entry: extract directly from this level.
    if len(page_data) == entries_before:
        extract_pos_section(wxr, page_data, base_data, level2_node, "")

164 

165 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Korean Wiktionary page into a list of word-entry dicts.

    Page layout references:
    https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    """
    if page_title.startswith(("Appendix:", "T195546/NS111")):
        return []
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_section in root.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, lang_section)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

184 

185 

def extract_alt_form_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Extract alternative-spelling ("다른 표기"/"표기") sections."""
    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if t_node.template_name in ("alt", "alter"):
            extract_alt_template(wxr, base_data, t_node)

192 

193 

def extract_alt_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse an expanded {{alt}}/{{alter}} template into Form entries."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    forms: list[Form] = []
    for span_tag in expanded.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        classes = span_tag.attrs.get("class", "").split()
        if span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                forms.append(Form(form=word))
        elif span_lang.endswith("-Latn") and forms:
            # Romanization span applies to the form right before it.
            forms[-1].roman = clean_node(wxr, None, span_tag)
        elif "label-content" in classes and forms:
            # Label spans qualify every form collected so far.
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
    base_data.forms.extend(forms)

218 

219 

def extract_ja_see_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Record {{ja-see}}/{{ja-see-kango}} redirect targets."""
    for arg, value in t_node.template_parameters.items():
        # Positional (integer-keyed) arguments are the redirect targets.
        if isinstance(arg, int):
            base_data.redirects.append(clean_node(wxr, None, value))
    clean_node(wxr, base_data, t_node)  # collect categories

227 

228 

def extract_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Chinese form tables from an expanded {{zh-forms}} template."""
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            header = ""
            header_tags: list[str] = []
            header_has_span = False
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header, header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not header_has_span:
                    # Spanned headers already carry the forms themselves;
                    # only plain-header rows have data cells to parse.
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, header, header_tags
                    )

254 

255 

def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Parse a {{zh-forms}} header cell.

    Returns the header text preceding the first <span>, the raw tags
    split from that text, and whether the cell contained any <span>.
    """
    first_span_index = len(header_cell.children)
    has_span = False
    for index, _span in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
            has_span = True
    header = clean_node(wxr, None, header_cell.children[:first_span_index])
    # Header text like "간체 및 번체" splits into individual raw tags.
    tags = [
        part.strip()
        for part in re.split(r"/| 및 ", header)
        if part.strip() != ""
    ]
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        word_nodes = []
        sup_title = ""
        for child in span_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "sup":
                # A <sup> tooltip carries an extra tag for this form.
                for sup_span in child.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                word_nodes.append(child)
        if span_lang in ("zh-Hant", "zh-Hans"):
            for word in clean_node(wxr, None, word_nodes).split("/"):
                if word not in (base_data.word, ""):
                    form = Form(form=word, raw_tags=tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return header, tags, has_span

291 

292 

def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Parse a {{zh-forms}} data cell into forms (or anagram linkages).

    Args:
        row_header: cleaned text of the row's header cell.
        row_header_tags: raw tags split from the row header; never mutated
            by this function.
    """
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            # Wrapper span: recurse into it as if it were the cell.
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            # Small text is a qualifier applying to the forms seen so far.
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                # NOTE(review): "anagram" can never equal the Korean header
                # text (the anagram header is "어구전철", checked below);
                # confirm whether this is a leftover from the en extractor.
                if row_header != "anagram":
                    # Copy the list: attribute assignment on the model does
                    # not re-validate/copy, so assigning row_header_tags
                    # directly would alias it, and the append() in the
                    # font-size branch above would then mutate the caller's
                    # list and every other form sharing it.
                    form.raw_tags = list(row_header_tags)
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "어구전철":  # anagrams
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)

332 else: 

333 base_data.forms.extend(forms)