Coverage for src/wiktextract/extractor/zh/page.py: 77%

167 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 LevelNode, 

8 NodeKind, 

9 TemplateNode, 

10 WikiNode, 

11) 

12 

13from ...page import clean_node 

14from ...wxr_context import WiktextractContext 

15from ...wxr_logging import logger 

16from .descendant import extract_descendant_section 

17from .etymology import extract_etymology 

18from .gloss import extract_gloss 

19from .headword_line import extract_headword_line_template, extract_tlb_template 

20from .inflection import extract_inflections 

21from .linkage import extract_linkage_section 

22from .models import Form, Sense, WordEntry 

23from .note import extract_note 

24from .pronunciation import extract_pronunciation 

25from .section_titles import ( 

26 DESCENDANTS_TITLES, 

27 ETYMOLOGY_TITLES, 

28 IGNORED_TITLES, 

29 INFLECTION_TITLES, 

30 LINKAGE_TITLES, 

31 NOTES_TITLES, 

32 POS_TITLES, 

33 PRONUNCIATION_TITLES, 

34 TRANSLATIONS_TITLES, 

35) 

36from .translation import extract_translation 

37 

38 

# Trailing disambiguator on a section title: optional whitespace followed by
# either a parenthesised suffix or a bare number.  The parentheses are matched
# literally (fullwidth and ASCII forms); left unescaped they would form a
# capture group whose ".+" swallows the whole title.
# NOTE(review): the fullwidth alternative is presumed to be the form used by
# real section titles - confirm against the wiki's headings.
_SUBTITLE_SUFFIX_RE = re.compile(r"\s*(?:（.+）|\(.+\)|\d+)$")

# Lower-cased template names whose presence marks a descendants section.
_DESCENDANT_TEMPLATE_NAMES = frozenset(
    ["desc", "descendant", "desctree", "descendants tree", "cjkv"]
)


def _entries_or_base(
    page_data: list[WordEntry], base_data: WordEntry
) -> list[WordEntry]:
    """Return `page_data`, or `[base_data]` when `page_data` is still empty."""
    return page_data if len(page_data) > 0 else [base_data]


def _has_descendant_template(level_node: LevelNode) -> bool:
    """Tell whether any template under `level_node` is a descendant template."""
    return any(
        t_node.template_name.lower() in _DESCENDANT_TEMPLATE_NAMES
        for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE)
    )


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section to the extractor that matches its title.

    The title is cleaned from `level_node.largs`, stripped of a trailing
    parenthesised/number suffix and then looked up in the section title
    tables.  After the section itself is handled, child sections are
    processed recursively and direct child templates are passed to
    `add_page_end_categories`.

    Args:
        wxr: Extraction context; its `config.capture_*` flags gate most
            branches.
        page_data: Entries collected so far; extractors write into it.
        base_data: Entry used as a stand-in target while `page_data` is
            empty, and as the template copied for new entries.
        level_node: The section node to process.
    """
    subtitle = clean_node(wxr, None, level_node.largs)
    # remove number suffix from subtitle
    subtitle = _SUBTITLE_SUFFIX_RE.sub("", subtitle)
    wxr.wtp.start_subsection(subtitle)
    config = wxr.config

    if subtitle in IGNORED_TITLES:
        pass
    elif subtitle in POS_TITLES:
        process_pos_block(wxr, page_data, base_data, level_node, subtitle)
    elif config.capture_etymologies and subtitle.startswith(
        tuple(ETYMOLOGY_TITLES)
    ):
        extract_etymology(wxr, page_data, base_data, level_node)
    elif config.capture_pronunciation and subtitle in PRONUNCIATION_TITLES:
        extract_pronunciation(wxr, page_data, base_data, level_node)
    elif config.capture_linkages and subtitle in LINKAGE_TITLES:
        # A title that is also a descendants title is only treated as a
        # descendants section when it actually contains such a template.
        is_descendant_section = (
            subtitle in DESCENDANTS_TITLES
            and _has_descendant_template(level_node)
        )
        if is_descendant_section:
            # Skipped entirely (not parsed as linkage) when descendants
            # capturing is switched off.
            if config.capture_descendants:
                extract_descendant_section(
                    wxr,
                    level_node,
                    _entries_or_base(page_data, base_data),
                )
        else:
            extract_linkage_section(
                wxr,
                _entries_or_base(page_data, base_data),
                level_node,
                LINKAGE_TITLES[subtitle],
            )
    elif config.capture_translations and subtitle in TRANSLATIONS_TITLES:
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation(wxr, page_data, level_node)
    elif config.capture_inflections and subtitle in INFLECTION_TITLES:
        extract_inflections(
            wxr, _entries_or_base(page_data, base_data), level_node
        )
    elif config.capture_descendants and subtitle in DESCENDANTS_TITLES:
        extract_descendant_section(
            wxr, level_node, _entries_or_base(page_data, base_data)
        )
    elif subtitle in NOTES_TITLES:
        extract_note(wxr, _entries_or_base(page_data, base_data), level_node)
    else:
        wxr.wtp.debug(
            f"Unhandled subtitle: {subtitle}",
            sortid="extractor/zh/page/parse_section/192",
        )

    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)

    for template in level_node.find_child(NodeKind.TEMPLATE):
        add_page_end_categories(wxr, page_data, template)

112 

113 

def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_text: str,
):
    """Create a new entry for a part-of-speech section and fill its senses.

    `base_data.pos` is set from `POS_TITLES[pos_text]`, a deep copy of
    `base_data` is appended to `page_data`, and the section's children are
    then handed to the headword, `tlb` and gloss extractors.  If the last
    entry still has no sense and the section contains no list node, the
    section text (excluding sub-sections) is used as a single gloss, or a
    "no-gloss" sense is added when that text is empty.
    """
    pos_info = POS_TITLES[pos_text]
    base_data.pos = pos_info["pos"]
    new_entry = base_data.model_copy(deep=True)
    page_data.append(new_entry)
    new_entry.tags.extend(pos_info.get("tags", []))

    for position, node in enumerate(level_node.filter_empty_str_child()):
        if not isinstance(node, WikiNode):
            continue
        is_template = isinstance(node, TemplateNode)
        if is_template and position == 0:
            # The first non-empty child, when a template, is the headword line.
            extract_headword_line_template(
                wxr, page_data, node, base_data.lang_code
            )
            process_soft_redirect_template(wxr, node, page_data)
        elif is_template and node.template_name == "tlb":
            extract_tlb_template(wxr, node, page_data)
        elif node.kind == NodeKind.LIST:
            extract_gloss(wxr, page_data, node, Sense())

    # Nothing more to do when senses were found or a list exists anyway.
    if len(page_data[-1].senses) > 0 or level_node.contain_node(NodeKind.LIST):
        return

    # low quality pages don't put gloss in list
    gloss_text = clean_node(
        wxr,
        page_data[-1],
        list(level_node.invert_find_child(LEVEL_KIND_FLAGS)),
    )
    if len(gloss_text) > 0:
        fallback_sense = Sense(glosses=[gloss_text])
    else:
        fallback_sense = Sense(tags=["no-gloss"])
    page_data[-1].senses.append(fallback_sense)

153 

154 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its entries as plain dictionaries.

    Pages whose title ends with "/" plus a translation section title are
    skipped.  Every level-2 section is treated as one language: its name is
    mapped to a language code, a base entry is built, and its level-3
    sections are passed to `parse_section`.  Language sections without any
    level-3 section go through `process_low_quality_page` instead.

    Returns:
        `model_dump(exclude_defaults=True)` of every collected entry.
    """
    # page layout documents
    # https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
    # https://zh.wiktionary.org/wiki/Wiktionary:体例说明
    # https://zh.wiktionary.org/wiki/Wiktionary:格式手冊

    # skip translation pages
    translation_suffixes = tuple("/" + title for title in TRANSLATIONS_TITLES)
    if page_title.endswith(translation_suffixes):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    page_data = []
    for language_node in tree.find_child(NodeKind.LEVEL2):
        heading_categories = {}
        lang_name = clean_node(wxr, heading_categories, language_node.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":
            wxr.wtp.warning(
                f"Unrecognized language name: {lang_name}",
                sortid="extractor/zh/page/parse_page/509",
            )
            lang_code = "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = heading_categories.get("categories", [])
        for template_node in language_node.find_child(NodeKind.TEMPLATE):
            if template_node.template_name == "zh-forms":
                process_zh_forms(wxr, base_data, template_node)

        for section_node in language_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, section_node)

        if language_node.contain_node(NodeKind.LEVEL3):
            continue
        # No sub-sections at all: try the fallbacks on a fresh copy and drop
        # the copy again if they left it equal to the base entry.
        page_data.append(base_data.model_copy(deep=True))
        process_low_quality_page(wxr, language_node, page_data)
        if page_data[-1] == base_data:
            page_data.pop()

    # Every emitted entry carries at least one sense.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

219 

220 

def process_low_quality_page(
    wxr: WiktextractContext,
    level_node: WikiNode,
    page_data: list[WordEntry],
) -> None:
    """Fill the last entry from a language section that has no sub-sections.

    Soft-redirect templates among the direct children are handed to
    `process_soft_redirect_template`.  When none is present, the whole
    section text becomes a single gloss, and the entry's part of speech is
    taken from the first of its categories that (after removing the language
    name prefix) is a key of `POS_TITLES`.

    Args:
        wxr: Extraction context.
        level_node: The language section node.
        page_data: Collected entries; only the last one is modified.  Must
            not be empty.
    """
    is_soft_redirect = False
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        # Compare case-insensitively, the same way
        # process_soft_redirect_template() does, so that a capitalised
        # template name (e.g. "Zh-see") is not mistaken for gloss text.
        if template_node.template_name.lower() in (
            "ja-see",
            "ja-see-kango",
            "zh-see",
        ):
            process_soft_redirect_template(wxr, template_node, page_data)
            is_soft_redirect = True

    if not is_soft_redirect:  # only have a gloss text
        gloss_text = clean_node(wxr, page_data[-1], level_node.children)
        if len(gloss_text) > 0:
            for cat in page_data[-1].categories:
                # Strip the language name prefix from the category to get a
                # candidate POS title.
                cat = cat.removeprefix(page_data[-1].lang).strip()
                if cat in POS_TITLES:
                    pos_data = POS_TITLES[cat]
                    page_data[-1].pos = pos_data["pos"]
                    page_data[-1].tags.extend(pos_data.get("tags", []))
                    break
            page_data[-1].senses.append(Sense(glosses=[gloss_text]))

243 

244 

def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Record the redirect targets of a soft-redirect template.

    For `zh-see` the first positional argument is stored; for `ja-see` and
    `ja-see-kango` every positional (integer-keyed) argument is stored.
    Targets go into the `redirects` list of the last entry in `page_data`.
    Afterwards, for any template name, an entry whose `pos` is still
    "unknown" is marked "soft-redirect".
    """
    # https://zh.wiktionary.org/wiki/Template:Ja-see
    # https://zh.wiktionary.org/wiki/Template:Ja-see-kango
    # https://zh.wiktionary.org/wiki/Template:Zh-see
    name = template_node.template_name.lower()
    entry = page_data[-1]
    params = template_node.template_parameters

    if name == "zh-see":
        target = clean_node(wxr, None, params.get(1, ""))
        entry.redirects.append(target)
    elif name in ("ja-see", "ja-see-kango"):
        entry.redirects.extend(
            clean_node(wxr, None, arg_value)
            for arg_key, arg_value in params.items()
            if isinstance(arg_key, int)
        )

    if entry.pos == "unknown":
        entry.pos = "soft-redirect"

265 

266 

def process_zh_forms(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Copy forms and the literal meaning from a `zh-forms` template.

    Only string-named parameters are read:

    * `s`, `s<digits>` - a form tagged "Simplified Chinese"
    * `t<digits>` - a form tagged "Traditional Chinese"
    * `alt` - comma separated items, each "form-tag-tag..."; the parts after
      the first "-" are stored as raw tags
    * `lit` - stored as `base_data.literal_meaning`

    Forms whose text is empty are not added.
    """
    # https://zh.wiktionary.org/wiki/Template:zh-forms

    def keep_if_nonempty(candidate: Form) -> None:
        if len(candidate.form) > 0:
            base_data.forms.append(candidate)

    for name, value in template_node.template_parameters.items():
        if not isinstance(name, str):
            continue

        if name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, value)
        elif name == "alt":
            for item in clean_node(wxr, None, value).split(","):
                form_text, *raw_tags = item.split("-")
                keep_if_nonempty(Form(form=form_text, raw_tags=raw_tags))
        elif re.fullmatch(r"s\d*", name):
            keep_if_nonempty(
                Form(
                    form=clean_node(wxr, None, value),
                    tags=["Simplified Chinese"],
                )
            )
        # NOTE(review): unlike "s", a bare "t" is not matched here (a digit is
        # required) - confirm this asymmetry is intended.
        elif re.fullmatch(r"t\d+", name):
            keep_if_nonempty(
                Form(
                    form=clean_node(wxr, None, value),
                    tags=["Traditional Chinese"],
                )
            )

298 

299 

# Lower-cased names of templates that only add categories to a page.
# https://zh.wiktionary.org/wiki/Template:Zh-cat
# https://zh.wiktionary.org/wiki/Template:Catlangname
CATEGORY_TEMPLATES = frozenset({"zh-cat", "cln", "catlangname", "c", "topics"})


def add_page_end_categories(
    wxr: WiktextractContext, page_data: list[WordEntry], template: TemplateNode
) -> None:
    """Attach the categories of a category template to matching entries.

    Templates whose lower-cased name is not in `CATEGORY_TEMPLATES` are
    ignored.  Otherwise the categories collected by `clean_node` are added to
    every entry in `page_data` that shares its `lang_code` with the last
    entry.
    """
    if template.template_name.lower() not in CATEGORY_TEMPLATES:
        return

    collected = {}
    clean_node(wxr, collected, template)
    if len(page_data) == 0:
        return

    current_lang_code = page_data[-1].lang_code
    new_categories = collected.get("categories", [])
    for entry in page_data:
        if entry.lang_code == current_lang_code:
            entry.categories.extend(new_categories)