Coverage for src/wiktextract/page.py: 72%

266 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1# Code for parsing information from a single Wiktionary page. 

2# 

3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import re 

6from collections import defaultdict 

7from copy import copy 

8from typing import Any, Callable, Optional, Union 

9 

10from mediawiki_langcodes import name_to_code 

11from wikitextprocessor.core import ( 

12 NamespaceDataEntry, 

13 PostTemplateFnCallable, 

14 TemplateArgs, 

15 TemplateFnCallable, 

16) 

17from wikitextprocessor.node_expand import NodeHandlerFnCallable 

18from wikitextprocessor.parser import GeneralNode, NodeKind, WikiNode 

19 

20from .clean import clean_value 

21from .datautils import data_append, data_extend 

22from .import_utils import import_extractor_module 

23from .wxr_context import WiktextractContext 

24 

# Node kinds of section headings (levels 2 through 6); used to recognise
# subtitle nodes when walking a parsed page.
LEVEL_KINDS = {
    NodeKind.LEVEL2, NodeKind.LEVEL3, NodeKind.LEVEL4,
    NodeKind.LEVEL5, NodeKind.LEVEL6,
}

33 

34 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Wiktionary page and return its extracted entries.

    The actual parsing is delegated to the ``parse_page`` function of the
    "page" extractor module selected by ``wxr.wtp.lang_code``.  The result
    is then post-processed in place:

    * thesaurus linkages are injected when
      ``wxr.config.extract_thesaurus_pages`` is set;
    * categories are redistributed when ``wxr.config.dump_file_lang_code``
      is ``"en"``;
    * duplicate values are always removed.

    ``page_title`` is the title of the page and ``page_text`` its wikitext;
    both are passed to the extractor unchanged.  Returns the list of entry
    dictionaries produced by the extractor.
    """
    extractor = import_extractor_module(wxr.wtp.lang_code, "page")
    entries = extractor.parse_page(wxr, page_title, page_text)
    config = wxr.config
    if config.extract_thesaurus_pages:
        inject_linkages(wxr, entries)
    if config.dump_file_lang_code == "en":
        process_categories(wxr, entries)
    remove_duplicate_data(entries)
    return entries

52 

53 

def is_panel_template(wxr: WiktextractContext, template_name: str) -> bool:
    """Return True if ``template_name`` is a known panel template.

    Panel templates are those that produce an infobox in Wiktionary, plus
    certain other templates that we do not wish to expand.  A name matches
    when it is listed in the extractor module's ``PANEL_TEMPLATES`` or
    starts with one of its ``PANEL_PREFIXES``; either attribute may be
    absent from the module, in which case it is simply not consulted.
    """
    extractor = import_extractor_module(wxr.wtp.lang_code, "page")
    absent = object()  # distinguishes "attribute not defined" from any value

    exact_names = getattr(extractor, "PANEL_TEMPLATES", absent)
    if exact_names is not absent and template_name in exact_names:
        return True

    prefixes = getattr(extractor, "PANEL_PREFIXES", absent)
    return prefixes is not absent and template_name.startswith(
        tuple(prefixes)
    )

69 

70 

def recursively_extract(
    contents: Union[WikiNode, str, list[Union[str, WikiNode]]],
    fn: Callable[[Union[WikiNode, list[WikiNode]]], bool],
) -> tuple[list[Union[str, WikiNode]], list[Union[str, WikiNode]]]:
    """Split ``contents`` into the nodes accepted by ``fn`` and the rest.

    Returns ``(extracted, remaining)``.  ``extracted`` holds every WikiNode
    for which ``fn`` returned True, in traversal order.  ``remaining`` is
    the content with those nodes removed at every level; nodes that had to
    be descended into are replaced by shallow copies whose arguments and/or
    children have been filtered.  Only WikiNode objects can be extracted;
    anything else (e.g. strings) is always kept.

    Raises RuntimeError for a WikiNode kind this function does not know how
    to descend into.
    """
    # Sequences: handle every element on its own and concatenate results.
    if isinstance(contents, (list, tuple)):
        taken = []
        kept = []
        for item in contents:
            item_taken, item_kept = recursively_extract(item, fn)
            taken.extend(item_taken)
            kept.extend(item_kept)
        return taken, kept

    # Non-node content is never extracted.
    if not isinstance(contents, WikiNode):
        return [], [contents]

    # A node accepted by ``fn`` is extracted whole.
    if fn(contents):
        return [contents], []

    # Otherwise build a stripped copy and refill the parts relevant to the
    # node's kind with their filtered versions.
    kind = contents.kind
    stripped = copy(contents)
    stripped.children = []
    stripped.sarg = ""
    stripped.largs = []
    stripped.attrs = {}

    keep_attrs = keep_sarg = filter_args = filter_children = False
    if kind in LEVEL_KINDS or kind == NodeKind.LINK:
        # Both args and children may contain extractable nodes.
        filter_args = filter_children = True
    elif kind in {
        NodeKind.ITALIC,
        NodeKind.BOLD,
        NodeKind.TABLE,
        NodeKind.TABLE_CAPTION,
        NodeKind.TABLE_ROW,
        NodeKind.TABLE_HEADER_CELL,
        NodeKind.TABLE_CELL,
        NodeKind.PRE,
        NodeKind.PREFORMATTED,
    }:
        filter_children = True
    elif kind == NodeKind.HLINE:
        # Neither args nor children to look at.
        pass
    elif kind in (NodeKind.LIST, NodeKind.LIST_ITEM):
        # The string argument is carried over unchanged.
        keep_sarg = filter_children = True
    elif kind in {
        NodeKind.TEMPLATE,
        NodeKind.TEMPLATE_ARG,
        NodeKind.PARSER_FN,
        NodeKind.URL,
    }:
        filter_args = True
    elif kind == NodeKind.HTML:
        # Attributes and string argument are carried over unchanged.
        keep_attrs = keep_sarg = filter_children = True
    else:
        raise RuntimeError(f"recursively_extract: unhandled kind {kind}")

    taken = []
    if keep_attrs:
        stripped.attrs = contents.attrs
    if keep_sarg:
        stripped.sarg = contents.sarg
    if filter_args:
        # Each argument is filtered separately so the argument structure
        # (one list per argument) is preserved.
        filtered_args = []
        for arg in contents.largs:
            arg_taken, arg_kept = recursively_extract(arg, fn)
            filtered_args.append(arg_kept)
            taken.extend(arg_taken)
        stripped.largs = filtered_args
    if filter_children:
        child_taken, child_kept = recursively_extract(contents.children, fn)
        taken.extend(child_taken)
        stripped.children = child_kept
    return taken, [stripped]

160 

161 

def inject_linkages(wxr: WiktextractContext, page_data: list[dict]) -> None:
    """Add linkages found in the thesaurus database to ``page_data``.

    For every entry that has a ``pos``, the thesaurus is searched with the
    entry's word, language code and part of speech.  Each returned term is
    appended to the entry under the key ``term.linkage`` unless an item
    with the same word (and, when the term has a sense, the same sense) is
    already there.  ``page_data`` is modified in place.
    """
    # Imported here rather than at module level, as in the original code.
    from .thesaurus import search_thesaurus

    local_thesaurus_ns = wxr.wtp.NAMESPACE_DATA.get("Thesaurus", {}).get("name")  # type: ignore[call-overload]
    for data in page_data:
        if "pos" not in data:
            continue
        word = data["word"]
        lang_code = data["lang_code"]
        pos = data["pos"]
        for term in search_thesaurus(
            wxr.thesaurus_db_conn,  # type:ignore[arg-type]
            word,
            lang_code,
            pos,  # type: ignore[arg-type]
        ):
            already_present = any(
                existing.get("word") == term.term
                and (not term.sense or existing.get("sense") == term.sense)
                for existing in data.get(term.linkage, ())
            )
            if already_present:
                continue
            linkage = {
                "word": term.term,
                "source": f"{local_thesaurus_ns}:{word}",
            }
            # Copy only non-empty optional fields.  Truthiness (rather than
            # ``len(...) > 0``) also tolerates fields that are None.
            for key in ("sense", "tags", "raw_tags", "topics", "roman"):
                value = getattr(term, key)
                if value:
                    linkage[key] = value
            data_append(data, term.linkage, linkage)

200 

201 

def process_categories(
    wxr: WiktextractContext, page_data: list[dict[str, Any]]
) -> None:
    """Redistribute and filter the category data of ``page_data`` in place.

    Steps, in order:

    1. Per language: if the language has several entries, the categories of
       its last entry are copied to the other entries (categories are
       commonly only specified in the last part-of-speech).  If it has one
       entry with exactly one sense, ``categories``, ``topics``,
       ``wikidata`` and ``wikipedia`` are moved from the entry to that
       sense.
    2. Page-wide: categories of the very last entry are copied to all
       other entries, since categories are often given at the end of an
       article and may apply to other languages too.
    3. Categories whose leading language name resolves to a language code
       different from the entry's are removed; blank categories are
       dropped.  An entry left without categories loses the key.

    Entries with ``alt_of`` or ``form_of`` never receive propagated
    categories.
    """
    # Categories are not otherwise disambiguated; group entries by language
    # so they can be shared or moved within a language.
    # XXX can Category links be specified globally (i.e., in a different
    # language?)
    by_lang = defaultdict(list)
    for data in page_data:
        by_lang[data["lang"]].append(data)

    for entries in by_lang.values():
        if len(entries) > 1:
            last = entries[-1]
            if "categories" in last:
                vals = last["categories"]
                for data in entries[:-1]:
                    assert data is not last
                    assert data.get("categories") is not vals
                    if data.get("alt_of") or data.get("form_of"):
                        continue  # Don't add to alt-of/form-of entries
                    data_extend(data, "categories", vals)
            continue
        # Exactly one entry for this language (groups are never empty).
        data = entries[0]
        senses = data.get("senses") or []
        if len(senses) != 1:
            continue
        # Only one sense for this language.  Move categories and certain
        # other data to the sense.
        for field in ("categories", "topics", "wikidata", "wikipedia"):
            if field in data:
                data_extend(senses[0], field, data.pop(field))

    # Propagate categories of the last entry on the page to all others.
    if len(page_data) > 1:
        last = page_data[-1]
        if "categories" in last:
            page_cats = last["categories"]
            for data in page_data[:-1]:
                if data.get("form_of") or data.get("alt_of"):
                    continue  # Don't add to form_of or alt_of entries
                data_extend(data, "categories", page_cats)

    # Remove category links that start with a language name from entries
    # for different languages.
    rhymes_ns_prefix = (
        wxr.wtp.NAMESPACE_DATA.get("Rhymes", {}).get("name", "") + ":"  # type: ignore[call-overload]
    )
    for data in page_data:
        lang_code = data.get("lang_code")
        new_cats = []
        for cat in data.get("categories", []):
            words = cat.removeprefix(rhymes_ns_prefix).split(maxsplit=1)
            if not words:
                # Blank category (or nothing after the Rhymes prefix):
                # indexing ``[0]`` would raise IndexError, and the value
                # carries no information, so drop it.
                continue
            cat_lang = words[0].split("/", maxsplit=1)[0]
            cat_lang_code = name_to_code(cat_lang, "en")
            if (
                cat_lang_code != ""
                and cat_lang_code != lang_code
                and not (lang_code == "mul" and cat_lang_code == "en")
            ):
                continue
            new_cats.append(cat)
        if new_cats:
            data["categories"] = new_cats
        elif "categories" in data:
            del data["categories"]

290 

291 

def remove_duplicate_data(page_data: list[dict[str, Any]]) -> None:
    """Remove redundant values from every entry of ``page_data`` in place.

    * ``categories``, ``topics``, ``tags``, ``wikidata`` and ``wikipedia``
      lists, on entries and on their senses, are replaced by sorted lists
      of their distinct values.
    * For senses that have ``raw_glosses``: if the sense also has glosses,
      every ``"empty-gloss"`` tag is removed; if ``raw_glosses`` is
      identical to ``glosses``, ``raw_glosses`` is deleted.
    """
    dedup_fields = ("categories", "topics", "tags", "wikidata", "wikipedia")
    for data in page_data:
        senses = data.get("senses", ())
        for field in dedup_fields:
            if field in data:
                data[field] = sorted(set(data[field]))
            for sense in senses:
                if field in sense:
                    sense[field] = sorted(set(sense[field]))

        for sense in senses:
            raw_glosses = sense.get("raw_glosses", ())
            if not raw_glosses:
                # NOTE(review): senses without raw_glosses keep an
                # "empty-gloss" tag even when they have glosses; this
                # matches the existing behaviour - confirm it is intended.
                continue
            glosses = sense.get("glosses", ())
            if glosses:
                tags = sense.get("tags", ())
                while "empty-gloss" in tags:
                    tags.remove("empty-gloss")
            if len(raw_glosses) == len(glosses) and all(
                raw == gloss for raw, gloss in zip(raw_glosses, glosses)
            ):
                del sense["raw_glosses"]

323 

324 

def clean_node(
    wxr: WiktextractContext,
    sense_data: Optional[Any],
    wikinode: GeneralNode,
    template_fn: Optional[TemplateFnCallable] = None,
    post_template_fn: Optional[PostTemplateFnCallable] = None,
    node_handler_fn: Optional[NodeHandlerFnCallable] = None,
    collect_links: bool = False,
    no_strip=False,
    no_html_strip=False,
) -> str:
    """
    Expands node or nodes to text, cleaning up HTML tags and duplicate spaces.

    ``wikinode`` is rendered with ``wxr.wtp.node_to_html`` and the result is
    passed through ``clean_value`` (with ``no_strip`` / ``no_html_strip``
    forwarded to it).

    If `sense_data` is not None, expanded category links will be added to
    it under the `categories` key, and Lua error markers in the expansion
    add the tags `error-lua-exec` / `error-lua-timeout`. If `collect_links`
    is `True`, links without a namespace prefix are also added to the
    `links` key as `(text, target)` tuples.

    `template_fn` and `post_template_fn` are handed to `node_to_html`
    unchanged. `node_handler_fn`, when given, replaces the default node
    handler, which expands table cells to their children.
    """
    # NOTE(review): an inner wrapper that fell back to is_panel_template()
    # when no template_fn was given used to be defined here but was never
    # passed to node_to_html(); it has been removed as dead code.  Panel
    # templates are therefore NOT suppressed by this function - confirm
    # whether that is intended.

    def clean_node_handler_fn_default(
        node: WikiNode,
    ) -> Optional[list[Union[str, WikiNode]]]:
        assert isinstance(node, WikiNode)
        if node.kind in {
            NodeKind.TABLE_CELL,
            NodeKind.TABLE_HEADER_CELL,
        }:
            return node.children
        return None

    clean_node_handler_fn = (
        node_handler_fn
        if node_handler_fn is not None
        else clean_node_handler_fn_default
    )

    v = wxr.wtp.node_to_html(
        wikinode,
        node_handler_fn=clean_node_handler_fn,
        template_fn=template_fn,
        post_template_fn=post_template_fn,
    )

    # Collect every name under which the Category namespace may appear.
    # The namespace entry, its name or its aliases may be missing; None and
    # empty names are skipped because they would either crash the join
    # below or make the pattern match links without any namespace.
    category_ns_data: NamespaceDataEntry = wxr.wtp.NAMESPACE_DATA.get(
        "Category",
        {},  # type: ignore[typeddict-item]
    )
    category_ns_names: set[str] = {"Category", "category"}
    for ns_name in (
        category_ns_data.get("name"),
        *(category_ns_data.get("aliases") or ()),
    ):
        if ns_name:
            category_ns_names.add(ns_name)
    # Names are escaped so they are matched literally; sorting only makes
    # the generated pattern deterministic.
    category_names_pattern = (
        "(?:" + "|".join(map(re.escape, sorted(category_ns_names))) + ")"
    )

    def add_category(raw: str) -> None:
        # Normalise whitespace and record the category once.
        cat = re.sub(r"\s+", " ", clean_value(wxr, raw)).strip()
        if cat and not sense_data_has_value(sense_data, "categories", cat):
            data_append(sense_data, "categories", cat)

    # Capture categories if sense_data has been given.  We also track
    # Lua execution errors here.
    # If collect_links=True (for glosses), capture links
    if sense_data is not None:
        if '<strong class="error">Lua execution error' in v:
            data_append(sense_data, "tags", "error-lua-exec")
        if '<strong class="error">Lua timeout error' in v:
            data_append(sense_data, "tags", "error-lua-timeout")
        if not collect_links:
            # Capture Category tags
            for m in re.finditer(
                rf"(?is)\[\[:?\s*{category_names_pattern}\s*:([^]|]+)",
                v,
            ):
                add_category(m.group(1))
        else:
            for m in re.finditer(
                r"(?is)\[\[:?(\s*([^][|:]+):)?\s*([^]|]+)(\|([^]|]+))?\]\]",
                #          1   2                 3        4  5
                v,
            ):
                # Add here other stuff different "Something:restofthelink"
                # things;
                namespace = m.group(2)
                if namespace and namespace.strip() in category_ns_names:
                    add_category(m.group(3))
                    continue
                if m.group(1):
                    # Link into some other namespace: not collected.
                    continue
                if m.group(5):
                    ltext = clean_value(wxr, m.group(5))
                    ltarget = clean_value(wxr, m.group(3))
                elif not m.group(3):
                    continue
                else:
                    ltext = ltarget = clean_value(wxr, m.group(3))
                ltarget = re.sub(r"\s+", " ", ltarget).strip()
                ltext = re.sub(r"\s+", " ", ltext).strip()
                if not ltext and not ltarget:
                    continue
                if not ltext:
                    ltext = ltarget
                ltuple = (ltext, ltarget)
                if not sense_data_has_value(sense_data, "links", ltuple):
                    data_append(sense_data, "links", ltuple)

    v = clean_value(wxr, v, no_strip=no_strip, no_html_strip=no_html_strip)

    # Some templates create <sup>(Category: ...)</sup>; remove
    v = re.sub(
        rf"(?si)\s*(?:<sup>)?\({category_names_pattern}:[^)]+\)(?:</sup>)?",
        "",
        v,
    )
    # Some templates create question mark in <sup>, e.g.,
    # some Korean Hanja form
    v = v.replace("^?", "")
    return v

464 

465 

def sense_data_has_value(
    sense_data: dict[str, Any], name: str, value: Any
) -> bool:
    """
    Return True if `value` is contained in the collection stored under
    `name` in `sense_data`.

    If `sense_data` is a dictionary, the collection is looked up as key
    `name`; otherwise it is read from the attribute `name`.  A missing key,
    a missing attribute or an attribute set to None counts as an empty
    collection.
    """
    # Dictionaries are checked first: ``hasattr`` would otherwise pick up
    # dict methods such as ``keys`` or ``items`` when ``name`` happens to
    # match one, and ``value in <method>`` raises TypeError.
    if isinstance(sense_data, dict):
        return value in sense_data.get(name, ())  # type:ignore[operator]
    container = getattr(sense_data, name, None)
    return container is not None and value in container