Coverage for src/wiktextract/extractor/zh/page.py: 77%

1import re

2from typing import Any

4from mediawiki_langcodes import name_to_code

5from wikitextprocessor.parser import (

6 LEVEL_KIND_FLAGS,

7 LevelNode,

8 NodeKind,

9 TemplateNode,

10 WikiNode,

11)

13from ...page import clean_node

14from ...wxr_context import WiktextractContext

15from ...wxr_logging import logger

16from .descendant import extract_descendant_section

17from .etymology import extract_etymology

18from .gloss import extract_gloss

19from .headword_line import extract_headword_line_template, extract_tlb_template

20from .inflection import extract_inflections

21from .linkage import extract_linkage_section

22from .models import Form, Sense, WordEntry

23from .note import extract_note

24from .pronunciation import extract_pronunciation

25from .section_titles import (

26 DESCENDANTS_TITLES,

27 ETYMOLOGY_TITLES,

28 IGNORED_TITLES,

29 INFLECTION_TITLES,

30 LINKAGE_TITLES,

31 NOTES_TITLES,

32 POS_TITLES,

33 PRONUNCIATION_TITLES,

34 TRANSLATIONS_TITLES,

35)

36from .translation import extract_translation

def _has_descendant_template(level_node: LevelNode) -> bool:
    """Return True if any nested template is a descendant-list template.

    Template names are compared lower-cased.
    """
    descendant_names = (
        "desc",
        "descendant",
        "desctree",
        "descendants tree",
        "cjkv",
    )
    return any(
        node.template_name.lower() in descendant_names
        for node in level_node.find_child_recursively(NodeKind.TEMPLATE)
    )


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section to the extractor that matches its heading.

    The heading text is cleaned, a trailing number or full-width
    parenthesised suffix is removed, and the result is looked up in the
    section title tables.  Afterwards every child section is parsed
    recursively and each direct child template is passed to
    ``add_page_end_categories``.

    Args:
        wxr: Extraction context (config and wikitext processor).
        page_data: Entries collected so far for the page; may be extended.
        base_data: Language-level entry used when ``page_data`` is empty.
        level_node: The section heading node to process.
    """
    heading = clean_node(wxr, None, level_node.largs)
    # Strip optional whitespace followed by either "（...）" or digits at the
    # very end of the heading.
    subtitle = re.sub(r"\s*(?:（.+）|\d+)$", "", heading)
    wxr.wtp.start_subsection(subtitle)

    config = wxr.config
    # Extractors that attach data to existing entries fall back to the bare
    # language-level entry when no entry has been created yet.
    entries = page_data or [base_data]

    if subtitle in IGNORED_TITLES:
        pass
    elif subtitle in POS_TITLES:
        process_pos_block(wxr, page_data, base_data, level_node, subtitle)
    elif config.capture_etymologies and subtitle.startswith(
        tuple(ETYMOLOGY_TITLES)
    ):
        extract_etymology(wxr, page_data, base_data, level_node)
    elif config.capture_pronunciation and subtitle in PRONUNCIATION_TITLES:
        extract_pronunciation(wxr, page_data, base_data, level_node)
    elif config.capture_linkages and subtitle in LINKAGE_TITLES:
        # A title listed in both tables is handled as a descendant section
        # only when it actually contains a descendant template.
        descendant_like = (
            subtitle in DESCENDANTS_TITLES
            and _has_descendant_template(level_node)
        )
        if not descendant_like:
            extract_linkage_section(
                wxr, entries, level_node, LINKAGE_TITLES[subtitle]
            )
        elif config.capture_descendants:
            extract_descendant_section(wxr, level_node, entries)
    elif config.capture_translations and subtitle in TRANSLATIONS_TITLES:
        if not page_data:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation(wxr, page_data, level_node)
    elif config.capture_inflections and subtitle in INFLECTION_TITLES:
        extract_inflections(wxr, entries, level_node)
    elif config.capture_descendants and subtitle in DESCENDANTS_TITLES:
        extract_descendant_section(wxr, level_node, entries)
    elif subtitle in NOTES_TITLES:
        extract_note(wxr, entries, level_node)
    else:
        wxr.wtp.debug(
            f"Unhandled subtitle: {subtitle}",
            sortid="extractor/zh/page/parse_section/192",
        )

    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)

    for child_template in level_node.find_child(NodeKind.TEMPLATE):
        add_page_end_categories(wxr, page_data, child_template)

112

113

def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_text: str,
):
    """Create a new entry for a part-of-speech section and fill its senses.

    ``base_data.pos`` is updated in place, a deep copy of ``base_data`` is
    appended to ``page_data``, and the section's children are scanned for
    the headword template, ``tlb`` templates and gloss lists.  When that
    leaves the entry without senses and the section contains no list node,
    the section text itself becomes the gloss (or a ``no-gloss`` sense if
    the text is empty).

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far; the new entry is appended.
        base_data: Language-level entry that is copied for this section.
        level_node: The part-of-speech section node.
        pos_text: Section title; must be a key of ``POS_TITLES``.
    """
    pos_info = POS_TITLES[pos_text]
    base_data.pos = pos_info["pos"]
    new_entry = base_data.model_copy(deep=True)
    page_data.append(new_entry)
    new_entry.tags.extend(pos_info.get("tags", []))

    for position, node in enumerate(level_node.filter_empty_str_child()):
        if not isinstance(node, WikiNode):
            continue
        is_template = isinstance(node, TemplateNode)
        if is_template and position == 0:
            # A template in first position is treated as the headword line.
            extract_headword_line_template(
                wxr, page_data, node, base_data.lang_code
            )
            process_soft_redirect_template(wxr, node, page_data)
        elif is_template and node.template_name == "tlb":
            extract_tlb_template(wxr, node, page_data)
        elif node.kind == NodeKind.LIST:
            extract_gloss(wxr, page_data, node, Sense())

    current = page_data[-1]
    if current.senses or level_node.contain_node(NodeKind.LIST):
        return

    # Low quality pages don't put the gloss in a list: use everything in the
    # section that is not a sub-section heading as the gloss text.
    gloss_text = clean_node(
        wxr,
        current,
        list(level_node.invert_find_child(LEVEL_KIND_FLAGS)),
    )
    fallback_sense = (
        Sense(glosses=[gloss_text]) if gloss_text else Sense(tags=["no-gloss"])
    )
    current.senses.append(fallback_sense)

153

154

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Chinese Wiktionary page into a list of entry dicts.

    Page layout documents:
    https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
    https://zh.wiktionary.org/wiki/Wiktionary:体例说明
    https://zh.wiktionary.org/wiki/Wiktionary:格式手冊

    Args:
        wxr: Extraction context (config and wikitext processor).
        page_title: Title of the page being parsed.
        page_text: Raw wikitext of the page.

    Returns:
        One dict per extracted entry, dumped with default values excluded.
        Empty for translation subpages.
    """
    # Translation subpages ("<word>/<translations title>") are skipped.
    translation_suffixes = tuple(
        f"/{tr_title}" for tr_title in TRANSLATIONS_TITLES
    )
    if page_title.endswith(translation_suffixes):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    root = wxr.wtp.parse(page_text, pre_expand=True)

    page_data = []
    for lang_section in root.find_child(NodeKind.LEVEL2):
        heading_categories = {}
        lang_name = clean_node(wxr, heading_categories, lang_section.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":
            wxr.wtp.warning(
                f"Unrecognized language name: {lang_name}",
                sortid="extractor/zh/page/parse_page/509",
            )
            lang_code = "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = heading_categories.get("categories", [])

        for template_node in lang_section.find_child(NodeKind.TEMPLATE):
            if template_node.template_name == "zh-forms":
                process_zh_forms(wxr, base_data, template_node)

        for sub_section in lang_section.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, sub_section)

        if not lang_section.contain_node(NodeKind.LEVEL3):
            # No sub-sections at all: try the low-quality page handling on a
            # copy of the base entry and drop the copy if nothing was added.
            page_data.append(base_data.model_copy(deep=True))
            process_low_quality_page(wxr, lang_section, page_data)
            if page_data[-1] == base_data:
                page_data.pop()

    # Every emitted entry carries at least one sense.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

219

220

def process_low_quality_page(
    wxr: WiktextractContext,
    level_node: WikiNode,
    page_data: list[WordEntry],
) -> None:
    """Handle a language section that has no sub-sections.

    Such a section is either a soft redirect (``ja-see``, ``ja-see-kango``
    or ``zh-see`` template) or consists only of gloss text.  Soft redirect
    templates are forwarded to ``process_soft_redirect_template``;
    otherwise the cleaned section text is stored as the single gloss of
    the last entry, and the part of speech is taken from the first entry
    category that matches a ``POS_TITLES`` key once the language name
    prefix is removed.

    Args:
        wxr: Extraction context.
        level_node: The language section node.
        page_data: Entries collected so far; the last one is updated.
    """
    is_soft_redirect = False
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        # Compare lower-cased, as process_soft_redirect_template does:
        # pages may spell the template with a capital first letter
        # ("Ja-see", "Zh-see"), which a case-sensitive match would miss.
        if template_node.template_name.lower() in (
            "ja-see",
            "ja-see-kango",
            "zh-see",
        ):
            process_soft_redirect_template(wxr, template_node, page_data)
            is_soft_redirect = True

    if is_soft_redirect:
        return

    # Only a gloss text is present.
    entry = page_data[-1]
    gloss_text = clean_node(wxr, entry, level_node.children)
    if len(gloss_text) == 0:
        return

    for cat in entry.categories:
        # Category names are expected to look like "<language><POS title>".
        cat = cat.removeprefix(entry.lang).strip()
        if cat in POS_TITLES:
            pos_data = POS_TITLES[cat]
            entry.pos = pos_data["pos"]
            entry.tags.extend(pos_data.get("tags", []))
            break
    entry.senses.append(Sense(glosses=[gloss_text]))

243

244

def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Record soft redirect targets on the last entry of ``page_data``.

    Template documentation:
    https://zh.wiktionary.org/wiki/Template:Ja-see
    https://zh.wiktionary.org/wiki/Template:Ja-see-kango
    https://zh.wiktionary.org/wiki/Template:Zh-see

    ``zh-see`` contributes its first positional parameter; ``ja-see`` and
    ``ja-see-kango`` contribute every positional parameter.  Any other
    template adds no redirect.  In all cases an entry whose part of speech
    is still ``"unknown"`` is marked as ``"soft-redirect"``.

    Args:
        wxr: Extraction context.
        template_node: Template to inspect (name compared lower-cased).
        page_data: Entries collected so far; must not be empty.
    """
    name = template_node.template_name.lower()
    params = template_node.template_parameters
    entry = page_data[-1]

    if name == "zh-see":
        target = params.get(1, "")
        entry.redirects.append(clean_node(wxr, None, target))
    elif name in ("ja-see", "ja-see-kango"):
        # Integer keys are the positional parameters.
        entry.redirects.extend(
            clean_node(wxr, None, value)
            for key, value in params.items()
            if isinstance(key, int)
        )

    if entry.pos == "unknown":
        entry.pos = "soft-redirect"

265

266

def process_zh_forms(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Copy forms and the literal meaning from a ``zh-forms`` template.

    https://zh.wiktionary.org/wiki/Template:zh-forms

    Only named (string-keyed) parameters are read:

    * ``s``, ``s2``, ...: form tagged "Simplified Chinese"
    * ``t2``, ``t3``, ... (``t`` plus at least one digit): form tagged
      "Traditional Chinese"
    * ``alt``: comma-separated forms, each optionally followed by
      ``-``-separated raw tags
    * ``lit``: literal meaning

    Forms whose text is empty are not added.

    Args:
        wxr: Extraction context.
        base_data: Language-level entry that receives the data.
        template_node: The ``zh-forms`` template node.
    """

    def add_form(candidate: Form) -> None:
        # Skip forms whose text is empty.
        if candidate.form:
            base_data.forms.append(candidate)

    for name, value in template_node.template_parameters.items():
        if not isinstance(name, str):
            continue

        if re.fullmatch(r"s\d*", name):
            script_tag = "Simplified Chinese"
        elif re.fullmatch(r"t\d+", name):
            script_tag = "Traditional Chinese"
        else:
            script_tag = None

        if script_tag is not None:
            add_form(Form(form=clean_node(wxr, None, value), tags=[script_tag]))
        elif name == "alt":
            for alt_text in clean_node(wxr, None, value).split(","):
                form_text, *raw_tags = alt_text.split("-")
                add_form(Form(form=form_text, raw_tags=raw_tags))
        elif name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, value)

298

299

# Template names whose categories add_page_end_categories collects; names are
# lower-cased before the lookup.
# https://zh.wiktionary.org/wiki/Template:Zh-cat
# https://zh.wiktionary.org/wiki/Template:Catlangname
CATEGORY_TEMPLATES = frozenset(
    {
        "zh-cat",
        "cln",
        "catlangname",
        "c",
        "topics",
    }
)

303

304

def add_page_end_categories(
    wxr: WiktextractContext, page_data: list[WordEntry], template: TemplateNode
) -> None:
    """Add a category template's categories to the current language's entries.

    Templates whose lower-cased name is not in ``CATEGORY_TEMPLATES`` are
    ignored.  Otherwise the categories collected by ``clean_node`` are
    appended to every entry that has the same ``lang_code`` as the last
    entry in ``page_data``.

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far; may be empty.
        template: Template node found directly under a section.
    """
    if template.template_name.lower() not in CATEGORY_TEMPLATES:
        return

    collected = {}
    clean_node(wxr, collected, template)
    if not page_data:
        return

    new_categories = collected.get("categories", [])
    current_lang_code = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang_code:
            entry.categories.extend(new_categories)

313 data.categories.extend(categories.get("categories", []))