Coverage for src/wiktextract/clean.py: 84%

320 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1# This file contains code to clean Wiktionary annotations from a string and to 

2# produce plain text from it, typically for glossary entries but this is also 

3# called for various other data to produce clean strings. 

4# 

5# This file also contains code for cleaning qualifiers for the "tags" field. 

6# 

7# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

8 

9import html 

10import re 

11import unicodedata 

12from typing import Callable, Optional, Union 

13 

14from wikitextprocessor.common import MAGIC_FIRST, MAGIC_LAST, URL_STARTS 

15from wikitextprocessor.core import NamespaceDataEntry, TemplateArgs 

16from wikitextprocessor.parser import TemplateParameters 

17 

18from .wxr_context import WiktextractContext 

19 

20###################################################################### 

21# Cleaning values into plain text. 

22###################################################################### 

23 

24superscript_ht: dict[str, str] = { 

25 "0": "⁰", 

26 "1": "¹", 

27 "2": "²", 

28 "3": "³", 

29 "4": "⁴", 

30 "5": "⁵", 

31 "6": "⁶", 

32 "7": "⁷", 

33 "8": "⁸", 

34 "9": "⁹", 

35 "+": "⁺", 

36 "-": "⁻", 

37 "−": "⁻", 

38 "‐": "⁻", 

39 "–": "⁻", 

40 "—": "⁻", 

41 "一": "⁻", 

42 "=": "⁼", 

43 "(": "⁽", 

44 ")": "⁾", 

45 "A": "ᴬ", 

46 "B": "ᴮ", 

47 "D": "ᴰ", 

48 "E": "ᴱ", 

49 "G": "ᴳ", 

50 "H": "ᴴ", 

51 "I": "ᴵ", 

52 "J": "ᴶ", 

53 "K": "ᴷ", 

54 "L": "ᴸ", 

55 "M": "ᴹ", 

56 "N": "ᴺ", 

57 "O": "ᴼ", 

58 "P": "ᴾ", 

59 "R": "ᴿ", 

60 "T": "ᵀ", 

61 "U": "ᵁ", 

62 "V": "ⱽ", 

63 "W": "ᵂ", 

64 "a": "ᵃ", 

65 "b": "ᵇ", 

66 "c": "ᶜ", 

67 "d": "ᵈ", 

68 "e": "ᵉ", 

69 "f": "ᶠ", 

70 "g": "ᵍ", 

71 "h": "ʰ", 

72 "i": "ⁱ", 

73 "j": "ʲ", 

74 "k": "ᵏ", 

75 "l": "ˡ", 

76 "m": "ᵐ", 

77 "n": "ⁿ", 

78 "o": "ᵒ", 

79 "p": "ᵖ", 

80 "r": "ʳ", 

81 "s": "ˢ", 

82 "t": "ᵗ", 

83 "u": "ᵘ", 

84 "v": "ᵛ", 

85 "w": "ʷ", 

86 "x": "ˣ", 

87 "y": "ʸ", 

88 "z": "ᶻ", 

89 "β": "ᵝ", 

90 "γ": "ᵞ", 

91 "δ": "ᵟ", 

92 "θ": "ᶿ", 

93 "ι": "ᶥ", 

94 "φ": "ᵠ", 

95 "χ": "ᵡ", 

96 "∞": "\u2002᪲", # This is a KLUDGE 

97} 

98 

99subscript_ht: dict[str, str] = { 

100 "0": "₀", 

101 "1": "₁", 

102 "2": "₂", 

103 "3": "₃", 

104 "4": "₄", 

105 "5": "₅", 

106 "6": "₆", 

107 "7": "₇", 

108 "8": "₈", 

109 "9": "₉", 

110 "+": "₊", 

111 "-": "₋", 

112 "−": "₋", 

113 "=": "₌", 

114 "(": "₍", 

115 ")": "₎", 

116 "a": "ₐ", 

117 "e": "ₑ", 

118 "h": "ₕ", 

119 "i": "ᵢ", 

120 "j": "ⱼ", 

121 "k": "ₖ", 

122 "l": "ₗ", 

123 "m": "ₘ", 

124 "n": "ₙ", 

125 "o": "ₒ", 

126 "p": "ₚ", 

127 "r": "ᵣ", 

128 "s": "ₛ", 

129 "t": "ₜ", 

130 "u": "ᵤ", 

131 "v": "ᵥ", 

132 "x": "ₓ", 

133 "ə": "ₔ", 

134 "ρ": "ᵨ", 

135 "φ": "ᵩ", 

136 "χ": "ᵪ", 

137} 

138 

139 

def to_superscript(text: str) -> str:
    """Render ``text`` as superscript.

    When every character of ``text`` has an entry in ``superscript_ht`` the
    result is built from those Unicode superscript characters.  Otherwise a
    plain-text fallback is used: ``^x`` for a single character and
    ``^(...)`` for anything longer.  Empty input yields an empty string.
    """
    if not text:
        return ""
    # One lookup per character; a missing entry shows up as None.
    glyphs = [superscript_ht.get(ch) for ch in text]
    if None not in glyphs:
        return "".join(glyphs)
    if len(text) == 1:
        return "^" + text
    return "^({})".format(text)

149 

150 

def to_subscript(text: str) -> str:
    """Render ``text`` as subscript.

    When every character of ``text`` has an entry in ``subscript_ht`` the
    result is built from those Unicode subscript characters.  Otherwise a
    plain-text fallback is used: ``_x`` for a single character and
    ``_(...)`` for anything longer.  Empty input yields an empty string.
    """
    if not text:
        return ""
    # One lookup per character; a missing entry shows up as None.
    glyphs = [subscript_ht.get(ch) for ch in text]
    if None not in glyphs:
        return "".join(glyphs)
    return "_" + text if len(text) == 1 else "_({})".format(text)

160 

161 

def to_chem(text: str) -> str:
    """Converts text to a chemical formula, making ASCII digits subscript.

    Only the ASCII digits 0-9 are passed to ``to_subscript``.  Other
    characters for which ``str.isdigit()`` is true (for example digits that
    are already subscript or superscript, such as "₂" or "²") are left
    unchanged: ``subscript_ht`` has no entry for them, so ``to_subscript``
    would otherwise fall back to its ``_x`` form and mangle the formula.
    """
    return "".join(
        to_subscript(ch) if ch.isascii() and ch.isdigit() else ch
        for ch in text
    )

165 

166 

167# Mapping from Latex names to Unicode characters/strings. This is the 

168# default mapping (some cases are handled specially in the code). 

169math_map: dict[str, str] = { 

170 # XXX should probably change greek characters to non-slanted ones? 

171 "AC": "∿", 

172 "APLcomment": "⍝", 

173 "APLdownarrowbox": "⍗", 

174 "APLinput": "⍞", 

175 "APLinv": "⌹", 

176 "APLleftarrowbox": "⍇", 

177 "APLlog": "⍟", 

178 "APLrightarrowbox": "⍈", 

179 "APLuparrowbox": "⍐", 

180 "Angstroem": "Å", 

181 "Bot": "⫫", 

182 "Box": "□", 

183 "Bumpeq": "≎", 

184 "CIRCLE": "●", 

185 "Cap": "⋒", 

186 "CapitalDifferentialD": "ⅅ", 

187 "CheckedBox": "☑", 

188 "Circle": "○", 

189 "Coloneqq": "⩴", 

190 "ComplexI": "ⅈ", 

191 "ComplexJ": "ⅉ", 

192 "Cup": "⋓", 

193 "Delta": "Δ", 

194 "Diamond": "◇", 

195 "Diamondblack": "◆", 

196 "Diamonddot": "⟐", 

197 "DifferentialD": "ⅆ", 

198 "Digamma": "Ϝ", 

199 "Doteq": "≑", 

200 "DownArrowBar": "⤓", 

201 "DownLeftTeeVector": "⥞", 

202 "DownLeftVectorBar": "⥖", 

203 "DownRightTeeVector": "⥟", 

204 "DownRightVectorBar": "⥗", 

205 "Downarrow": "⇓", 

206 "Equal": "⩵", 

207 "Euler": "Ɛ", 

208 "ExponentialE": "ⅇ", 

209 "ExponetialE": "ⅇ", 

210 "Finv": "Ⅎ", 

211 "Gamma": "Γ", 

212 "Im": "ℑ", 

213 "Join": "⨝", 

214 "Koppa": "Ϟ", 

215 "LEFTCIRCLE": "◖", 

216 "LEFTcircle": "◐", 

217 "LHD": "◀", 

218 "LVec": "x⃖", 

219 "Lambda": "Λ", 

220 "Lbag": "⟅", 

221 "LeftArrowBar": "⇤", 

222 "LeftDownTeeVector": "⥡", 

223 "LeftDownVectorBar": "⥙", 

224 "LeftTeeVector": "⥚", 

225 "LeftTriangleBar": "⧏", 

226 "LeftUpTeeVector": "⥠", 

227 "LeftUpVectorBar": "⥘", 

228 "LeftVectorBar": "⥒", 

229 "Leftarrow": "⇐", 

230 "Leftrightarrow": "⇔", 

231 "Lleftarrow": "⇚", 

232 "Longleftarrow": "⟸", 

233 "Longleftrightarrow": "⟺", 

234 "Longmapsfrom": "⟽", 

235 "Longmapsto": "⟾", 

236 "Longrightarrow": "⟹", 

237 "Lparen": "⦅", 

238 "Lsh": "↰", 

239 "MapsDown": "↧", 

240 "MapsUp": "↥", 

241 "Mapsfrom": "⤆", 

242 "Mapsto": "⤇", 

243 "Micro": "µ", 

244 "Nearrow": "⇗", 

245 "NestedGreaterGreater": "⪢", 

246 "NestedLessLess": "⪡", 

247 "NotGreaterLess": "≹", 

248 "NotGreaterTilde": "≵", 

249 "NotLessTilde": "≴", 

250 "Nwarrow": "⇖", 

251 "Omega": "Ω", 

252 "Phi": "Φ", 

253 "Pi": "Π", 

254 "Proportion": "∷", 

255 "Psi": "Ψ", 

256 "Qoppa": "Ϙ", 

257 "RHD": "▶", 

258 "RIGHTCIRCLE": "◗", 

259 "RIGHTcircle": "◑", 

260 "Rbag": "⟆", 

261 "Re": "ℜ", 

262 "RightArrowBar": "⇥", 

263 "RightDownTeeVector": "⥝", 

264 "RightDownVectorBar": "⥕", 

265 "RightTeeVector": "⥛", 

266 "RightTriangleBar": "⧐", 

267 "RightUpTeeVector": "⥜", 

268 "RightUpVectorBar": "⥔", 

269 "RightVectorBar": "⥓", 

270 "Rightarrow": "⇒", 

271 "Rparen": "⦆", 

272 "Rrightarrow": "⇛", 

273 "Rsh": "↱", 

274 "S": "§", 

275 "Same": "⩶", 

276 "Sampi": "Ϡ", 

277 "Searrow": "⇘", 

278 "Sigma": "Σ", 

279 "Square": "☐", 

280 "Stigma": "Ϛ", 

281 "Subset": "⋐", 

282 "Sun": "☉", 

283 "Supset": "⋑", 

284 "Swarrow": "⇙", 

285 "Theta": "Θ", 

286 "Top": "⫪", 

287 "UpArrowBar": "⤒", 

288 "Uparrow": "⇑", 

289 "Updownarrow": "⇕", 

290 "Upsilon": "Υ", 

291 "VDash": "⊫", 

292 "VERT": "⦀", 

293 "Vdash": "⊩", 

294 "Vert": "‖", 

295 "Vvdash": "⊪", 

296 "XBox": "☒", 

297 "Xi": "Ξ", 

298 "Yup": "⅄", 

299 "_": "_", 

300 "aleph": "א", 

301 "alpha": "α", 

302 "amalg": "⨿", 

303 "anchor": "⚓", 

304 "angle": "∠", 

305 "approx": "≈", 

306 "approxeq": "≊", 

307 "aquarius": "♒", 

308 "arg": "arg", 

309 "aries": "♈", 

310 "arrowbullet": "➢", 

311 "ast": "∗", 

312 "asymp": "≍", 

313 "backepsilon": "϶", 

314 "backprime": "‵", 

315 "backsim": "∽", 

316 "backsimeq": "⋍", 

317 "backslash": "", 

318 "ballotx": "✗", 

319 "barin": "⋶", 

320 "barleftharpoon": "⥫", 

321 "barrightharpoon": "⥭", 

322 "barwedge": "⊼", 

323 "because": "∵", 

324 "beta": "β", 

325 "beth": "ב", 

326 "between": "≬", 

327 "bigcap": "∩", 

328 "bigcup": "∪", 

329 "biginterleave": "⫼", 

330 "bigodot": "⨀", 

331 "bigoplus": "⨁", 

332 "bigotimes": "⨂", 

333 "bigsqcap": "⨅", 

334 "bigsqcup": "⨆", 

335 "bigstar": "★", 

336 "bigtriangledown": "▽", 

337 "bigtriangleup": "△", 

338 "biguplus": "⨄", 

339 "bigvee": "∨", 

340 "bigwedge": "∧", 

341 "bij": "⤖", 

342 "biohazard": "☣", 

343 "blacklozenge": "⧫", 

344 "blacksmiley": "☻", 

345 "blacksquare": "■", 

346 "blacktriangledown": "▾", 

347 "blacktriangleleft": "◂", 

348 "blacktriangleright": "▸", 

349 "blacktriangleup": "▴", 

350 "bot": "⊥", 

351 "bowtie": "⋈", 

352 "boxast": "⧆", 

353 "boxbar": "◫", 

354 "boxbox": "⧈", 

355 "boxbslash": "⧅", 

356 "boxcircle": "⧇", 

357 "boxdot": "⊡", 

358 "boxminus": "⊟", 

359 "boxplus": "⊞", 

360 "boxslash": "⧄", 

361 "boxtimes": "⊠", 

362 "bullet": "•", 

363 "bumpeq": "≏", 

364 "cancer": "♋", 

365 "cap": "∩", 

366 "capricornus": "♑", 

367 "capwedge": "⩄", 

368 "cat": "⁀", 

369 "cdot": "·", 

370 "cdots": "⋯", 

371 "cent": "¢", 

372 "checkmark": "✓", 

373 "chi": "χ", 

374 "circ": "∘", 

375 "circeq": "≗", 

376 "circlearrowleft": "↺", 

377 "circlearrowright": "↻", 

378 "circledR": "®", 

379 "circledast": "⊛", 

380 "circledbslash": "⦸", 

381 "circledcirc": "⊚", 

382 "circleddash": "⊝", 

383 "circledgtr": "⧁", 

384 "circledless": "⧀", 

385 "clubsuit": "♣", 

386 "colon": ":", 

387 "coloneq": "≔", 

388 "complement": "∁", 

389 "cong": "≅", 

390 "coprod": "∐", 

391 "corresponds": "≙", 

392 "cup": "∪", 

393 "curlyeqprec": "⋞", 

394 "curlyeqsucc": "⋟", 

395 "curlyvee": "⋎", 

396 "curlywedge": "⋏", 

397 "curvearrowleft": "↶", 

398 "curvearrowright": "↷", 

399 "dagger": "†", 

400 "daleth": "ד", 

401 "dashleftarrow": "⇠", 

402 "dashrightarrow": "⇢", 

403 "dashv": "⊣", 

404 "ddagger": "‡", 

405 "delta": "δ", 

406 "diameter": "∅", 

407 "diamond": "⋄", 

408 "diamondsuit": "♢", 

409 "digamma": "ϝ", 

410 "div": "÷", 

411 "divideontimes": "⋇", 

412 "dlsh": "↲", 

413 "dot\\bigvee": "⩒", 

414 "dot\\cap": "⩀", 

415 "dot\\cup": "⊍", 

416 "dot\\lor": "⩒", 

417 "dot\\vee": "⩒", 

418 "doteq": "≐", 

419 "dotplus": "∔", 

420 "dots": "…", 

421 "doublebarwedge": "⩞", 

422 "downarrow": "↓", 

423 "downdownarrows": "⇊", 

424 "downdownharpoons": "⥥", 

425 "downharpoonleft": "⇃", 

426 "downharpoonright": "⇂", 

427 "downuparrows": "⇵", 

428 "downupharpoons": "⥯", 

429 "drsh": "↳", 

430 "dsub": "⩤", 

431 "earth": "♁", 

432 "eighthnote": "♪", 

433 "ell": "ℓ", 

434 "emptyset": "∅", 

435 "epsilon": "ϵ", 

436 "eqcirc": "≖", 

437 "eqcolon": "∹", 

438 "eqsim": "≂", 

439 "eqslantgtr": "⪖", 

440 "eqslantless": "⪕", 

441 "equiv": "≡", 

442 "eta": "η", 

443 "eth": "ð", 

444 "exists": "∃", 

445 "fallingdotseq": "≒", 

446 "fcmp": "⨾", 

447 "female": "♀", 

448 "ffun": "⇻", 

449 "finj": "⤕", 

450 "fint": "⨏", 

451 "flat": "♭", 

452 "footnotesize": "", 

453 "forall": "∀", 

454 "fourth": "⁗", 

455 "frown": "⌢", 

456 "frownie": "☹", 

457 "gamma": "γ", 

458 "ge": ">", 

459 "gemini": "♊", 

460 "geq": "≥", 

461 "geqq": "≧", 

462 "geqslant": "⩾", 

463 "gg": "≫", 

464 "ggcurly": "⪼", 

465 "ggg": "⋙", 

466 "gimel": "ג", 

467 "gnapprox": "⪊", 

468 "gneq": "⪈", 

469 "gneqq": "≩", 

470 "gnsim": "⋧", 

471 "gtrapprox": "⪆", 

472 "gtrdot": "⋗", 

473 "gtreqless": "⋛", 

474 "gtreqqless": "⪌", 

475 "gtrless": "≷", 

476 "gtrsim": "≳", 

477 "hash": "⋕", 

478 "heartsuit": "♡", 

479 "hookleftarrow": "↩", 

480 "hookrightarrow": "↪", 

481 "hslash": "ℏ", 

482 "iddots": "⋰", 

483 "iff": "⟺", 

484 "iiiint": "⨌", 

485 "iiint": "∭", 

486 "iint": "∬", 

487 "imath": "ı", 

488 "implies": "⟹", 

489 "in": "∈", 

490 "infty": "∞", 

491 "int": "∫", 

492 "intercal": "⊺", 

493 "interleave": "⫴", 

494 "invamp": "⅋", 

495 "invdiameter": "⍉", 

496 "invneg": "⌐", 

497 "iota": "ι", 

498 "jmath": "ȷ", 

499 "jupiter": "♃", 

500 "kappa": "κ", 

501 "koppa": "ϟ", 

502 "lambda": "λ", 

503 "land": "∧", 

504 "lang": "⟪", 

505 "langle": "⟨", 

506 "large": "", 

507 "lblot": "⦉", 

508 "lbrace": "{", 

509 "lbrack": "[", 

510 "lceil": "⌈", 

511 "ldots": "…", 

512 "le": "<", 

513 "leadsto": "⤳", 

514 "leftarrow": "←", 

515 "leftarrowtail": "↢", 

516 "leftarrowtriangle": "⇽", 

517 "leftbarharpoon": "⥪", 

518 "leftharpoondown": "↽", 

519 "leftharpoonup": "↼", 

520 "leftleftarrows": "⇇", 

521 "leftleftharpoons": "⥢", 

522 "leftmoon": "☾", 

523 "leftrightarrow": "↔", 

524 "leftrightarrows": "⇆", 

525 "leftrightarrowtriangle": "⇿", 

526 "leftrightharpoon": "⥊", 

527 "leftrightharpoondown": "⥐", 

528 "leftrightharpoons": "⇋", 

529 "leftrightharpoonup": "⥎", 

530 "leftrightsquigarrow": "↭", 

531 "leftslice": "⪦", 

532 "leftsquigarrow": "⇜", 

533 "leftthreetimes": "⋋", 

534 "leftupdownharpoon": "⥑", 

535 "leo": "♌", 

536 "leq": "≤", 

537 "leqq": "≦", 

538 "leqslant": "⩽", 

539 "lessapprox": "⪅", 

540 "lessdot": "⋖", 

541 "lesseqgtr": "⋚", 

542 "lesseqqgtr": "⪋", 

543 "lessgtr": "≶", 

544 "lessim": "≲", 

545 "lesssim": "≲", 

546 "lfloor": "⌊", 

547 "lgroup": "⟮", 

548 "lhd": "◁", 

549 "libra": "♎", 

550 "lightning": "↯", 

551 "limg": "⦇", 

552 "ll": "≪", 

553 "llbracket": "⟦", 

554 "llcorner": "⌞", 

555 "llcurly": "⪻", 

556 "lll": "⋘", 

557 "lnapprox": "⪉", 

558 "lneq": "⪇", 

559 "lneqq": "≨", 

560 "lnot": "¬", 

561 "lnsim": "⋦", 

562 "longleftarrow": "⟵", 

563 "longleftrightarrow": "⟷", 

564 "longmapsfrom": "⟻", 

565 "longmapsto": "⟼", 

566 "longrightarrow": "⟶", 

567 "looparrowleft": "↫", 

568 "looparrowright": "↬", 

569 "lor": "∨", 

570 "lozenge": "◊", 

571 "lrcorner": "⌟", 

572 "ltimes": "⋉", 

573 "male": "♂", 

574 "maltese": "✠", 

575 "mapsfrom": "↤", 

576 "mapsto": "↦", 

577 "measuredangle": "∡", 

578 "medbullet": "⚫", 

579 "medcirc": "⚪", 

580 "mercury": "☿", 

581 "mho": "℧", 

582 "mid": "∣", 

583 "mlcp": "⫛", 

584 "mod": " mod ", 

585 "models": "⊧", 

586 "mp": "∓", 

587 "mu": "μ", 

588 "multimap": "⊸", 

589 "multimapboth": "⧟", 

590 "multimapdotbothA": "⊶", 

591 "multimapdotbothB": "⊷", 

592 "multimapinv": "⟜", 

593 "nLeftarrow": "⇍", 

594 "nLeftrightarrow": "⇎", 

595 "nRightarrow": "⇏", 

596 "nVDash": "⊯", 

597 "nVdash": "⊮", 

598 "nabla": "∇", 

599 "napprox": "≉", 

600 "natural": "♮", 

601 "ncong": "≇", 

602 "nearrow": "↗", 

603 "neg": "¬", 

604 "neptune": "♆", 

605 "neq": "≠", 

606 "nequiv": "≢", 

607 "nexists": "∄", 

608 "ngeq": "≱", 

609 "ngtr": "≯", 

610 "ni": "∋", 

611 "nleftarrow": "↚", 

612 "nleftrightarrow": "↮", 

613 "nleq": "≰", 

614 "nless": "≮", 

615 "nmid": "∤", 

616 "nni": "∌", 

617 "normalsize": "", 

618 "not\\in": "∉", 

619 "not\\ni": "∌", 

620 "not\\preceq": "⋠", 

621 "not\\subset": "⊄", 

622 "not\\subseteq": "⊈", 

623 "not\\succeq": "⋡", 

624 "not\\supset": "⊅", 

625 "not\\supseteq": "⊉", 

626 "not\\trianglelefteq": "⋬", 

627 "not\\trianglerighteq": "⋭", 

628 "not\\vartriangleleft": "⋪", 

629 "not\\vartriangleright": "⋫", 

630 "notasymp": "≭", 

631 "notbackslash": "⍀", 

632 "notin": "∉", 

633 "notslash": "⌿", 

634 "nparallel": "∦", 

635 "nprec": "⊀", 

636 "npreceq": "⋠", 

637 "nrightarrow": "↛", 

638 "nsim": "≁", 

639 "nsimeq": "≄", 

640 "nsqsubseteq": "⋢", 

641 "nsqsupseteq": "⋣", 

642 "nsubset": "⊄", 

643 "nsubseteq": "⊈", 

644 "nsucc": "⊁", 

645 "nsucceq": "⋡", 

646 "nsupset": "⊅", 

647 "nsupseteq": "⊉", 

648 "ntriangleleft": "⋪", 

649 "ntrianglelefteq": "⋬", 

650 "ntriangleright": "⋫", 

651 "ntrianglerighteq": "⋭", 

652 "nu": "ν", 

653 "nvDash": "⊭", 

654 "nvdash": "⊬", 

655 "nwarrow": "↖", 

656 "odot": "⊙", 

657 "oiiint": "∰", 

658 "oiint": "∯", 

659 "oint": "∮", 

660 "ointctrclockwise": "∳", 

661 "omega": "ω", 

662 "ominus": "⊖", 

663 "oplus": "⊕", 

664 "oslash": "⊘", 

665 "otimes": "⊗", 

666 "over": "/", 

667 "overbrace": "⏞", 

668 "overleftrightarrow": "x⃡", 

669 "overparen": "⏜", 

670 "overset?=": "≟", 

671 "overset{?}{=}": "≟", 

672 "overset{\\operatorname{def}}{=}": "≝", 

673 "parallel": "∥", 

674 "partial": "∂", 

675 "pencil": "✎", 

676 "perp": "⊥", 

677 "pfun": "⇸", 

678 "phi": "ϕ", 

679 "pi": "π", 

680 "pinj": "⤔", 

681 "pisces": "♓", 

682 "pitchfork": "⋔", 

683 "pluto": "♇", 

684 "pm": "±", 

685 "pointright": "☞", 

686 "pounds": "£", 

687 "prec": "≺", 

688 "precapprox": "⪷", 

689 "preccurlyeq": "≼", 

690 "preceq": "⪯", 

691 "preceqq": "⪳", 

692 "precnapprox": "⪹", 

693 "precnsim": "⋨", 

694 "precsim": "≾", 

695 "prime": "′", 

696 "prod": "∏", 

697 "propto": "∝", 

698 "psi": "ψ", 

699 "psur": "⤀", 

700 "qoppa": "ϙ", 

701 "quad": " ", 

702 "quarternote": "♩", 

703 "radiation": "☢", 

704 "rang": "⟫", 

705 "rangle": "⟩", 

706 "rarr": "→", 

707 "rblot": "⦊", 

708 "rbrace": "}", 

709 "rbrack": "]", 

710 "rceil": "⌉", 

711 "recycle": "♻", 

712 "rfloor": "⌋", 

713 "rgroup": "⟯", 

714 "rhd": "▷", 

715 "rho": "ρ", 

716 "rightangle": "∟", 

717 "rightarrow": "→", 

718 "rightarrowtail": "↣", 

719 "rightarrowtriangle": "⇾", 

720 "rightbarharpoon": "⥬", 

721 "rightharpoondown": "⇁", 

722 "rightharpoonup": "⇀", 

723 "rightleftarrows": "⇄", 

724 "rightleftharpoon": "⥋", 

725 "rightleftharpoons": "⇌", 

726 "rightmoon": "☽", 

727 "rightrightarrows": "⇉", 

728 "rightrightharpoons": "⥤", 

729 "rightslice": "⪧", 

730 "rightsquigarrow": "⇝", 

731 "rightthreetimes": "⋌", 

732 "rightupdownharpoon": "⥏", 

733 "rimg": "⦈", 

734 "risingdotseq": "≓", 

735 "rrbracket": "⟧", 

736 "rsub": "⩥", 

737 "rtimes": "⋊", 

738 "sagittarius": "♐", 

739 "sampi": "ϡ", 

740 "saturn": "♄", 

741 "scorpio": "♏", 

742 "scriptsize": "", 

743 "searrow": "↘", 

744 "second": "″", 

745 "setminus": "⧵", 

746 "sharp": "♯", 

747 "sigma": "σ", 

748 "sim": "∼", 

749 "simeq": "≃", 

750 "sixteenthnote": "♬", 

751 "skull": "☠", 

752 "slash": "∕", 

753 "small": "", 

754 "smallsetminus": "∖", 

755 "smalltriangledown": "▿", 

756 "smalltriangleleft": "◃", 

757 "smalltriangleright": "▹", 

758 "smalltriangleup": "▵", 

759 "smile": "⌣", 

760 "smiley": "☺", 

761 "spadesuit": "♠", 

762 "spddot": "¨", 

763 "sphat": "^", 

764 "sphericalangle": "∢", 

765 "spot": "⦁", 

766 "sptilde": "~", 

767 "sqcap": "⊓", 

768 "sqcup": "⊔", 

769 "sqint": "⨖", 

770 "sqrt": "√", # ∛ ∜ - partly special handling below 

771 "sqrt[3]": "∛", 

772 "sqrt[4]": "∜", 

773 "sqsubset": "⊏", 

774 "sqsubseteq": "⊑", 

775 "sqsupset": "⊐", 

776 "sqsupseteq": "⊒", 

777 "square": "□", 

778 "sslash": "⫽", 

779 "star": "⋆", 

780 "steaming": "☕", 

781 "stigma": "ϛ", 

782 "strictfi": "⥼", 

783 "strictif": "⥽", 

784 "subset": "⊂", 

785 "subseteq": "⊆", 

786 "subseteqq": "⫅", 

787 "subsetneq": "⊊", 

788 "subsetneqq": "⫋", 

789 "succ": "≻", 

790 "succapprox": "⪸", 

791 "succcurlyeq": "≽", 

792 "succeq": "⪰", 

793 "succeqq": "⪴", 

794 "succnapprox": "⪺", 

795 "succnsim": "⋩", 

796 "succsim": "≿", 

797 "sum": "∑", 

798 "sun": "☼", 

799 "supset": "⊃", 

800 "supseteq": "⊇", 

801 "supseteqq": "⫆", 

802 "supsetneq": "⊋", 

803 "supsetneqq": "⫌", 

804 "swarrow": "↙", 

805 "swords": "⚔", 

806 "talloblong": "⫾", 

807 "tau": "τ", 

808 "taurus": "♉", 

809 "tcohm": "Ω", 

810 "textbackslash": "\\", 

811 "textbar": "|", 

812 "textbullet": "•", 

813 "textgreater": ">", 

814 "textless": "<", 

815 "textprime": "′", 

816 "therefore": "∴", 

817 "theta": "θ", 

818 "third": "‴", 

819 "times": "×", 

820 "tiny": "", 

821 "to": "→", 

822 "top": "⊤", 

823 "triangle": "∆", 

824 "trianglelefteq": "⊴", 

825 "triangleq": "≜", 

826 "trianglerighteq": "⊵", 

827 "twoheadleftarrow": "↞", 

828 "twoheadrightarrow": "↠", 

829 "twonotes": "♫", 

830 "ulcorner": "⌜", 

831 "underbar": " ̱", 

832 "underbrace": "⏟", 

833 "underleftarrow": "x⃮", 

834 "underline": " ̲", 

835 "underparen": "⏝", 

836 "underrightarrow": "x⃯", 

837 "uparrow": "↑", 

838 "updownarrow": "↕", 

839 "updownarrows": "⇅", 

840 "updownharpoons": "⥮", 

841 "upharpoonleft": "↿", 

842 "upharpoonright": "↾", 

843 "uplus": "⊎", 

844 "upsilon": "υ", 

845 "upuparrows": "⇈", 

846 "upupharpoons": "⥣", 

847 "uranus": "♅", 

848 "urcorner": "⌝", 

849 "utilde": " ̰", 

850 "vDash": "⊨", 

851 "varbeta": "β", 

852 "varclubsuit": "♧", 

853 "vardiamondsuit": "♦", 

854 "varepsilon": "ε", 

855 "varheartsuit": "♥", 

856 "varkappa": "ϰ", 

857 "varnothing": "∅", 

858 "varointclockwise": "∲", 

859 "varphi": "φ", 

860 "varpi": "ϖ", 

861 "varprod": "⨉", 

862 "varrho": "ϱ", 

863 "varsigma": "ς", 

864 "varspadesuit": "♤", 

865 "vartheta": "θ", 

866 "vartriangleleft": "⊲", 

867 "vartriangleright": "⊳", 

868 "vdash": "⊢", 

869 "vdots": "⋮", 

870 "vee": "∨", 

871 "veebar": "⊻", 

872 "vert": "|", 

873 "virgo": "♍", 

874 "warning": "⚠", 

875 "wasylozenge": "⌑", 

876 "wedge": "∧", 

877 "widehat=": "≙", 

878 "widehat{=}": "≙", 

879 "wp": "℘", 

880 "wr": "≀", 

881 "xi": "ξ", 

882 "yen": "¥", 

883 "yinyang": "☯", 

884 "zcmp": "⨟", 

885 "zeta": "ζ", 

886 "zhide": "⧹", 

887 "zpipe": "⨠", 

888 "zproject": "⨡", 

889 "|": "‖", 

890 # Accents XXX these really should be handled specially with diacritics 

891 # after argument 

892 "acute": "́", 

893 "bar": "̄", 

894 "breve": "̆", 

895 "check": "̌", 

896 "ddddot": "⃜", 

897 "dddot": "⃛", 

898 "ddot": "̈", 

899 "ddots": "⋱", 

900 "dot": "̇", 

901 "grave": "̀", 

902 "hat": "̂", 

903 "lvec": "⃐", 

904 "mathring": "̊", 

905 "not": "̸", 

906 "overline": "◌̅", 

907 "tilde": "̃", 

908 "vec": "⃑", 

909 # Some ignored operators 

910 "bigl": "", 

911 "bigr": "", 

912 "left": "", 

913 "right": "", 

914 "style": "", 

915 "textstyle": "", 

916 "mathrm": "", 

917} 

918 

919mathcal_map: dict[str, str] = { 

920 "A": "𝒜", 

921 "B": "ℬ", 

922 "C": "𝒞", 

923 "D": "𝒟", 

924 "E": "ℰ", 

925 "F": "ℱ", 

926 "G": "𝒢", 

927 "H": "ℋ", 

928 "I": "ℐ", 

929 "J": "𝒥", 

930 "K": "𝒦", 

931 "L": "ℒ", 

932 "M": "ℳ", 

933 "N": "𝒩", 

934 "O": "𝒪", 

935 "P": "𝒫", 

936 "Q": "𝒬", 

937 "R": "ℛ", 

938 "S": "𝒮", 

939 "T": "𝒯", 

940 "U": "𝒰", 

941 "V": "𝒱", 

942 "W": "𝒲", 

943 "X": "𝒳", 

944 "Y": "𝒴", 

945 "Z": "𝒵", 

946 "a": "𝒶", 

947 "b": "𝒷", 

948 "c": "𝒸", 

949 "d": "𝒹", 

950 "e": "ℯ", 

951 "f": "𝒻", 

952 "g": "ℊ", 

953 "h": "𝒽", 

954 "i": "𝒾", 

955 "j": "𝒿", 

956 "k": "𝓀", 

957 "l": "𝓁", 

958 "m": "𝓂", 

959 "n": "𝓃", 

960 "o": "ℴ", 

961 "p": "𝓅", 

962 "q": "𝓆", 

963 "r": "𝓇", 

964 "s": "𝓈", 

965 "t": "𝓉", 

966 "u": "𝓊", 

967 "v": "𝓋", 

968 "w": "𝓌", 

969 "x": "𝓍", 

970 "y": "𝓎", 

971 "z": "𝓏", 

972} 

973 

974mathfrak_map: dict[str, str] = { 

975 "A": "𝔄", 

976 "B": "𝔅", 

977 "C": "ℭ", 

978 "D": "𝔇", 

979 "E": "𝔈", 

980 "F": "𝔉", 

981 "G": "𝔊", 

982 "H": "ℌ", 

983 "J": "𝔍", 

984 "K": "𝔎", 

985 "L": "𝔏", 

986 "M": "𝔐", 

987 "N": "𝔑", 

988 "O": "𝔒", 

989 "P": "𝔓", 

990 "Q": "𝔔", 

991 "S": "𝔖", 

992 "T": "𝔗", 

993 "U": "𝔘", 

994 "V": "𝔙", 

995 "W": "𝔚", 

996 "X": "𝔛", 

997 "Y": "𝔜", 

998 "Z": "ℨ", 

999} 

1000 

1001mathbb_map: dict[str, str] = { 

1002 "A": "𝔸", 

1003 "B": "𝔹", 

1004 "C": "ℂ", 

1005 "D": "𝔻", 

1006 "E": "𝔼", 

1007 "F": "𝔽", 

1008 "G": "𝔾", 

1009 "H": "ℍ", 

1010 "I": "𝕀", 

1011 "J": "𝕁", 

1012 "K": "𝕂", 

1013 "L": "𝕃", 

1014 "M": "𝕄", 

1015 "N": "ℕ", 

1016 "O": "𝕆", 

1017 "P": "ℙ", 

1018 "Q": "ℚ", 

1019 "R": "ℝ", 

1020 "S": "𝕊", 

1021 "T": "𝕋", 

1022 "U": "𝕌", 

1023 "V": "𝕍", 

1024 "W": "𝕎", 

1025 "X": "𝕏", 

1026 "Y": "𝕐", 

1027 "Z": "ℤ", 

1028 "a": "𝕒", 

1029 "b": "𝕓", 

1030 "c": "𝕔", 

1031 "d": "𝕕", 

1032 "e": "𝕖", 

1033 "f": "𝕗", 

1034 "g": "𝕘", 

1035 "h": "𝕙", 

1036 "i": "𝕚", 

1037 "j": "𝕛", 

1038 "k": "𝕜", 

1039 "l": "𝕝", 

1040 "m": "𝕞", 

1041 "n": "𝕟", 

1042 "o": "𝕠", 

1043 "p": "𝕡", 

1044 "q": "𝕢", 

1045 "r": "𝕣", 

1046 "s": "𝕤", 

1047 "t": "𝕥", 

1048 "u": "𝕦", 

1049 "v": "𝕧", 

1050 "w": "𝕨", 

1051 "x": "𝕩", 

1052 "y": "𝕪", 

1053 "z": "𝕫", 

1054 "pi": "ℼ", 

1055 "gamma": "ℽ", 

1056 "Gamma": "ℾ", 

1057 "Pi": "ℿ", 

1058 "Sigma": "⅀", 

1059 "0": "𝟘", 

1060 "1": "𝟙", 

1061 "2": "𝟚", 

1062 "3": "𝟛", 

1063 "4": "𝟜", 

1064 "5": "𝟝", 

1065 "6": "𝟞", 

1066 "7": "𝟟", 

1067 "8": "𝟠", 

1068 "9": "𝟡", 

1069} 

1070 

1071 

def mathcal_fn(text: str) -> str:
    """Replace each character of ``text`` with its entry in ``mathcal_map``.

    Characters without an entry are kept unchanged.
    """
    converted = []
    for ch in text:
        converted.append(mathcal_map.get(ch, ch))
    return "".join(converted)

1074 

1075 

def mathfrak_fn(text: str) -> str:
    """Replace each character of ``text`` with its entry in ``mathfrak_map``.

    Characters without an entry are kept unchanged.
    """
    converted = []
    for ch in text:
        converted.append(mathfrak_map.get(ch, ch))
    return "".join(converted)

1078 

1079 

def mathbb_fn(text: str) -> str:
    """Replace each character of ``text`` with its entry in ``mathbb_map``.

    Characters without an entry are kept unchanged.  The lookup is done one
    character at a time, so the multi-character keys of ``mathbb_map``
    ("pi", "gamma", "Gamma", "Pi", "Sigma") are never matched here.
    """
    converted = []
    for ch in text:
        converted.append(mathbb_map.get(ch, ch))
    return "".join(converted)

1082 

1083 

def to_math(text: str) -> str:
    """Converts a LaTeX-style mathematical formula to plain text.

    Commands are replaced through ``math_map`` and the ``mathcal`` /
    ``mathfrak`` / ``mathbb`` tables, ``_`` and ``^`` operands go through
    ``to_subscript`` / ``to_superscript``, and ``{...}`` groups are
    converted recursively, innermost first.

    Returns the converted string.
    """
    # print("to_math: {!r}".format(text))
    # Converted text of every brace group processed so far.  Group number k
    # is represented in the working string by chr(MAGIC_FIRST + k).
    magic_vec: list[str] = []

    def expand(text: str) -> str:
        """Replace placeholder characters with their stored text, repeating
        until the string no longer changes (stored text may itself contain
        placeholders)."""
        while True:
            orig = text
            # formatting with {:c} converts input into character
            text = re.sub(
                r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST),
                lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST],
                text,
            )
            if text == orig:
                break
        return text

    def recurse(text: str) -> str:
        """Convert one formula (or the content of one brace group)."""

        def math_magic(
            text: str, left: str, right: str, fn: Callable[[str], str]
        ) -> str:
            """Replace every ``left...right`` group that is non-empty and
            contains no further delimiters with a placeholder character,
            storing ``fn(content).strip()`` in ``magic_vec``.  Repeats until
            no such group is left, so nested groups are handled inside-out."""
            regexp_str = r"{}([^{}{}]+){}".format(
                re.escape(left),
                re.escape(left),
                re.escape(right),
                re.escape(right),
            )
            regexp = re.compile(regexp_str)

            def repl(m: re.Match) -> str:
                # The placeholder is chosen before fn() runs; fn() may itself
                # append to magic_vec.
                # NOTE(review): if fn() appends entries, this placeholder's
                # index would no longer match the slot appended below -- it
                # looks safe only because inner groups contain no further
                # delimiters; confirm.
                magic = chr(MAGIC_FIRST + len(magic_vec))
                t = fn(m.group(1)).strip()
                magic_vec.append(t)
                return magic

            while True:
                orig = text
                text = re.sub(regexp, repl, text)
                if text == orig:
                    break
            return text

        def expand_group(v: str) -> str:
            """Convert a single token produced by the tokenizer loop below
            into its output text."""
            # Conversion applied to the operand at the end, if any.
            fn: Optional[Callable[[str], str]] = None
            if re.match(r"\\mathcal\b", v):
                fn = mathcal_fn
                v = v[8:].strip()  # drop the 8-character "\mathcal" prefix
            elif re.match(r"\\mathfrak\b", v):
                fn = mathfrak_fn
                v = v[9:].strip()  # drop the 9-character "\mathfrak" prefix
            elif re.match(r"\\mathbb\b", v):
                fn = mathbb_fn
                v = v[7:]  # drop "\mathbb" (unlike the two above, no strip)
            elif re.match(r"\\(begin|end)\b", v):
                v = ""  # Skip
            elif re.match(r"\\text\b", v):
                v = v[5:]  # drop the "\text" prefix
            elif re.match(r"\\pmod\b", v):
                v = v[5:].strip()  # drop the "\pmod" prefix
                v = "(mod " + expand_group(v) + ")"
            elif re.match(r"\\sqrt\[", v):
                # NOTE(review): the slice assumes v is exactly "\sqrt[n]".
                # The tokenizer regex below also appends the operand that
                # follows "[n]", so v may carry trailing characters that end
                # up inside ``a`` -- confirm.
                a = v[6:-1].strip()
                if a == "2":
                    v = "√"
                elif a == "3":
                    v = "∛"
                elif a == "4":
                    v = "∜"
                else:
                    v = to_superscript(a) + "√"
            elif re.match(r"\\sqrt($|[0-9]|\b)", v):
                # NOTE(review): the whole token is replaced, so anything that
                # followed "\sqrt" inside v is discarded -- confirm intended.
                v = "√"
            elif re.match(r"\\(frac|binom)($|[0-9]|\b)", v):
                # Exactly two operands are expected, each a command, an
                # escaped character or a single character.
                m = re.match(
                    r"\\(frac|binom)\s*(\\[a-zA-Z]+|\\.|.)\s*"
                    r"(\\[a-zA-Z]+|\\.|.)$",
                    v,
                )
                if not m:
                    # NOTE(review): the tokenizer below only has a two-operand
                    # alternative for \frac, so a \binom token presumably
                    # always ends up here -- confirm.
                    print("MATH FRAC/BINOM ERROR: {!r}".format(v))
                    return v
                op, a, b = m.groups()
                a = expand_group(a).strip()
                b = expand_group(b).strip()
                # Parenthesize operands that expanded to more than one
                # character.
                if len(a) > 1:
                    a = "(" + a + ")"
                if len(b) > 1:
                    b = "(" + b + ")"
                if op == "frac":
                    v = a + "/" + b
                elif op == "binom":
                    v = "binom({}, {})".format(a, b)
                else:
                    # Should never get here
                    v = "{}({})".format(op, v)
            elif v.startswith("_"):
                fn = to_subscript
                v = v[1:]
            elif v.startswith("^"):
                fn = to_superscript
                v = v[1:]
            # Whatever remains may still be a backslash command (for example
            # the operand of "_" or "^"); look it up in math_map.
            if v.startswith("\\"):
                mapped = math_map.get(v[1:].strip())
                if mapped is None:
                    # Unknown command: keep its name, padded with spaces when
                    # it is alphanumeric.
                    if v[1:].strip().isalnum():
                        v = " " + v[1:].strip() + " "
                    else:
                        v = v[1:].strip()
                else:
                    v = mapped
            elif v.isspace() or v in ("&",):  # Ignore certain special chars
                v = ""
            if fn is not None:
                # Placeholders are expanded first so that fn sees real text.
                v = expand(v)
                v = fn(v)
            v = expand(v)
            return v

        parts: list[str] = []
        # Replace all brace groups by placeholders, converting their content
        # recursively.
        while True:
            orig = text
            text = math_magic(text, "{", "}", recurse)
            if text == orig:
                break
        # Tokenizer.  Alternatives, in order: a run of whitespace; "\frac"
        # with two operands; an optional prefix command (\mathcal, \mathfrak,
        # \mathbb, \text, \begin, \end, \pmod, or \sqrt with optional
        # "[digits]") followed by an optional "_" or "^" and one operand.
        # NOTE(review): the "\dot\...", "\not\...", "\widehat" and "\overset"
        # alternatives sit inside the second operand group of "\frac", so
        # they only apply there -- confirm intended.
        # NOTE(review): in r"\\overset\{?\}\{=\}" the "?" is unescaped and
        # acts as a quantifier making "{" optional rather than matching a
        # literal "?" -- confirm.
        for m in re.finditer(
            r"\s+|"
            r"\\frac\s*(\\[a-zA-Z]+|\\.|.)\s*"
            r"(\\dot\\(bigvee|cup|cap|lor|vee)|"
            r"\\not\\(subset|supset|subseteq|supseteq|in|ni|"
            r"preceq|succeq|vartrianglelefteq|"
            r"vartrianglerighteq|trianglelefteq|"
            r"trianglerighteq)|"
            r"\\widehat\{=\}|\\widehat=|"
            r"\\overset\{?\}\{=\}|"
            r"\\overset\?=|"
            r"\\overset\{\\operatorname\{def\}\}\{=\}|"
            r"\\[a-zA-Z]+|\\.|.)|"
            r"(\\(mathcal|mathfrak|mathbb|text|begin|end|pmod)"
            r"\b\s*|"
            r"\\sqrt\b(\[\d+\])?)?"
            r"[_^]?(\\[a-zA-Z]+\s*|\\.|\w+|.)",
            text,
        ):
            v = m.group(0).strip()
            if not v:
                continue
            v = expand_group(v)
            if v:
                # Insert a space before a part that starts with a digit when
                # the previous part ends in a letter or a digit.
                if (
                    parts and parts[-1][-1].isalpha() and v[0] in "0123456789"
                ) or (
                    parts
                    and parts[-1][-1] in "0123456789"
                    and v[0] in "0123456789"
                ):
                    v = " " + v
                parts.append(v)

        text = "".join(parts)
        return text

    text = recurse(text)
    # print("math text final: {!r}".format(text))
    return text

1249 

1250 

def bold_follows(parts: list[str], i: int) -> bool:
    """Tell whether a bold marker (''') occurs in ``parts`` after index ``i``.

    Elements that are not bold markers, including italic markers (''), are
    simply skipped over.
    """
    return any(later.startswith("'''") for later in parts[i + 1 :])

1261 

1262 

def remove_italic_and_bold(text: str) -> str:
    """Strip wikitext italic ('') and bold (''') markers from ``text``.

    Based on token_iter in wikitextprocessor.  The text is split on runs of
    newlines (the separators are kept as elements of their own) and the
    emphasis state is tracked independently for each element.  Apostrophes
    beyond those consumed as markup are kept as literal text.  A newline is
    appended after every element and the last one is dropped again, so the
    newline separators come out followed by one additional newline.
    """
    assert isinstance(text, str)
    italic, bold = 1, 2  # bit flags; italic | bold == 3 means "in both"
    marker_re = re.compile(r"(''+)")
    out = []
    for segment in re.split(r"(\n+)", text):  # lines and separators
        pieces = marker_re.split(segment)
        flags = 0
        for idx, piece in enumerate(pieces):
            if not piece.startswith("''"):
                out.append(piece)
                continue
            # Emphasis marker.  Decide how many apostrophes are markup and
            # which flags they toggle.
            if piece.startswith("'''''"):
                consumed = 5
                flags ^= italic | bold
            elif piece.startswith("'''"):
                if flags == italic and not bold_follows(pieces, idx):
                    # No matching bold later in this element: close the
                    # italic and keep the remaining apostrophe as text.
                    consumed = 2
                    flags = 0
                else:
                    consumed = 3
                    flags ^= bold
            else:
                consumed = 2
                flags ^= italic
            leftover = piece[consumed:]
            if leftover:
                out.append(leftover)
        out.append("\n")
    return "".join(out[:-1])  # drop the last "\n"

1327 

1328 

# Matches a File/Image link attribute (right, left, center, thumb or frame,
# delimited by pipes on both sides) that marks an image as *not* inline.
NOT_INLINE_IMG_RE = re.compile(r"\|\s*(right|left|center|thumb|frame)\s*\|")

# Case-insensitive alternation of every prefix in URL_STARTS, wrapped in a
# single capturing group.
URL_STARTS_RE = re.compile("(" + "|".join(URL_STARTS) + ")", re.IGNORECASE)

# Compiled on first use by clean_value(), because the File namespace
# prefixes come from the WiktextractContext passed to it.
IMAGE_LINK_RE: Optional[re.Pattern] = None

1339 

1340 

def clean_value(
    wxr: WiktextractContext,
    title: str,
    no_strip: bool = False,
    no_html_strip: bool = False,
) -> str:
    """Cleans a title or value into a normal string.  This should basically
    remove any Wikimedia formatting from it: HTML tags, templates, links,
    emphasis, etc.  This will also merge multiple whitespaces into one
    normal space and will remove any surrounding whitespace.

    Parameters:
      wxr: context; used to look up the File and Category namespace names.
      title: the wikitext/HTML string to clean.
      no_strip: if true, leading/trailing whitespace is kept.
      no_html_strip: if true, the final "remove any remaining HTML tags"
        step is skipped (only ``<noinclude/>`` is removed); the tags that
        are handled specially before that step are still processed.

    Returns the cleaned string, normalized to Unicode NFC.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(title, str)

    # Lazily build the regex matching File/Image namespace prefixes; it
    # depends on the namespace data carried by the context.
    global IMAGE_LINK_RE
    if IMAGE_LINK_RE is None:
        image_link_prefixes = wxr.wtp.namespace_prefixes(
            wxr.wtp.NAMESPACE_DATA["File"]["id"], suffix=""
        )
        IMAGE_LINK_RE = re.compile(
            rf"(?:{'|'.join(image_link_prefixes)})\s*:", re.IGNORECASE
        )

    def repl_1(m: re.Match) -> str:
        return clean_value(wxr, m.group(1), no_strip=True)

    def repl_exturl(m: re.Match) -> str:
        # Drop the leading URL-looking words and keep the label.  The last
        # word is never dropped, so a bare [url] is replaced by the URL.
        args = re.split(r"\s+", m.group(1))
        i = 0
        while i < len(args) - 1:
            if not URL_STARTS_RE.match(args[i]):
                break
            i += 1
        return " ".join(args[i:])

    def repl_link(m: re.Match) -> str:
        before_colon = m.group(1)
        after_colon = m.group(3)
        if (
            before_colon is not None
            and IMAGE_LINK_RE.match(before_colon) is not None
        ):
            # File/Image link without any parameters: nothing to show.
            return ""
        if before_colon is not None and before_colon.strip(": ") in ("w", "s"):
            # Wikipedia or Wikisource link
            v = after_colon.split("|")[0]
        else:
            v = m.group(0).strip("[] ").split("|")[0]
        return clean_value(wxr, v, no_strip=True)

    def repl_link_bars(m: re.Match) -> str:
        link = m.group(1)
        if IMAGE_LINK_RE.match(link) is not None:
            # Handle File / Image / Fichier 'links' here.
            whole = m.group(0)
            # The attribute can appear anywhere inside the link, so this
            # has to be search(); match() is anchored at the leading "[["
            # and could never succeed.
            if NOT_INLINE_IMG_RE.search(whole) is None and "alt" in whole:
                # This image should be inline, so let's print its alt text
                alt_m = re.search(r"\|\s*alt\s*=([^]|]+)(\||\]\])", whole)
                if alt_m is not None:
                    return "[Alt: " + alt_m.group(1) + "]"
            return ""
        # m.group(5) is always the last matching group because you can
        # only access the last matched group; the indexes don't 'grow'
        return clean_value(wxr, m.group(5) or m.group(2) or "", no_strip=True)

    def repl_1_sup(m: re.Match) -> str:
        return to_superscript(clean_value(wxr, m.group(1)))

    def repl_1_sub(m: re.Match) -> str:
        return to_subscript(clean_value(wxr, m.group(1)))

    def repl_1_chem(m: re.Match) -> str:
        return to_chem(clean_value(wxr, m.group(1)))

    def repl_1_math(m: re.Match) -> str:
        # The content is passed on raw (not cleaned first).
        return to_math(m.group(1))

    def repl_1_syntaxhighlight(m: re.Match) -> str:
        # Content is preformatted
        return "\n" + m.group(1).strip() + "\n"

    # remove nowiki tag returned from `Wtp.node_to_html()`
    title = re.sub(r"<nowiki\s*/>", "", title)

    # Remove any remaining templates
    # title = re.sub(r"\{\{[^}]+\}\}", "", title)

    # Remove tables, which can contain other tables.  Each pass removes the
    # innermost tables; repeat until nothing changes.
    prev = ""
    while title != prev:
        prev = title
        title = re.sub(
            r"\{\|((?!\{\|)(?!\|\}).)*\|\}",
            "\n",
            title,
            flags=re.DOTALL,
        )
    # title = re.sub(r"(?s)\{\|.*?\|\}", "\n", title)
    # Remove second reference tags (<ref name="ref_name"/>)
    title = re.sub(r"<ref\s+name=\"[^\"]+\"\s*/>", "", title)
    # Remove references (<ref>...</ref>).
    title = re.sub(r"(?is)<ref\b\s*[^>/]*?>\s*.*?</ref\s*>", "", title)
    # Replace <span>...</span> by stripped content without newlines
    title = re.sub(
        r"(?is)<span\b\s*[^>]*?>(.*?)\s*</span\s*>",
        lambda m: re.sub(r"\s+", " ", m.group(1)),
        title,
    )
    # Replace <br/> (with surrounding whitespace) by a newline (it is used
    # to express alternatives in some declensions)
    title = re.sub(r"(?si)\s*<br\s*/?>\n*", "\n", title)
    # Remove divs with floatright class (generated e.g. by {{ja-kanji|...}})
    title = re.sub(
        r'(?si)<div\b[^>]*?\bclass="[^"]*?\bfloatright\b[^>]*?>'
        r"((<div\b(<div\b.*?</div\s*>|.)*?</div>)|.)*?"
        r"</div\s*>",
        "",
        title,
    )
    # Remove divs with float: attribute
    title = re.sub(
        r'(?si)<div\b[^>]*?\bstyle="[^"]*?\bfloat:[^>]*?>'
        r"((<div\b(<div\b.*?</div\s*>|.)*?</div>)|.)*?"
        r"</div\s*>",
        "",
        title,
    )
    # Remove <sup> with previewonly class (generated e.g. by {{taxlink|...}})
    title = re.sub(
        r'(?si)<sup\b[^>]*?\bclass="[^"<>]*?'
        r"\bpreviewonly\b[^>]*?>"
        r".+?</sup\s*>",
        "",
        title,
    )
    # Remove <strong class="error">...</strong>
    title = re.sub(
        r'(?si)<strong\b[^>]*?\bclass="[^"]*?\berror\b[^>]*?>'
        r".+?</strong\s*>",
        "",
        title,
    )
    # Change <div> and </div> to newlines.  Ditto for tr, li, table, dl, ul, ol
    title = re.sub(r"(?si)</?(div|tr|li|table|dl|ul|ol)\b[^>]*>", "\n", title)
    # Change <dt>, <dd>, </dt> and </dd> into newlines;
    # these generate new rows/lines.
    title = re.sub(r"(?i)</?d[dt]\s*>", "\n", title)
    # Change <td> </td> to spaces.  Ditto for th.
    title = re.sub(r"(?si)</?(td|th)\b[^>]*>", " ", title)
    # Drop empty <sup></sup>; convert the rest with to_superscript()
    title = re.sub(r"(?si)<sup\b[^>]*>\s*</sup\s*>", "", title)
    title = re.sub(r"(?si)<sup\b[^>]*>(.*?)</sup\s*>", repl_1_sup, title)
    # Drop empty <sub></sub>; convert the rest with to_subscript()
    title = re.sub(r"(?si)<sub\b[^>]*>\s*</sub\s*>", "", title)
    title = re.sub(r"(?si)<sub\b[^>]*>(.*?)</sub\s*>", repl_1_sub, title)
    # Change <chem> ... </chem> using subscripts for digits
    title = re.sub(r"(?si)<chem\b[^>]*>(.*?)</chem\s*>", repl_1_chem, title)
    # Change <math> ... </math> using special formatting.
    title = re.sub(r"(?si)<math\b[^>]*>(.*?)</math\s*>", repl_1_math, title)
    # Change <syntaxhighlight> ... </syntaxhighlight> using special formatting.
    title = re.sub(
        r"(?si)<syntaxhighlight\b[^>]*>(.*?)" r"</syntaxhighlight\s*>",
        repl_1_syntaxhighlight,
        title,
    )
    # Remove any remaining HTML tags.
    if not no_html_strip:
        title = re.sub(r"(?s)<[/!a-zA-Z][^>]*>", "", title)
        title = re.sub(r"(?s)</[^>]+>", "", title)
    else:
        # Strip <noinclude/> anyway
        title = re.sub(r"(?si)<noinclude\s*/\s*>", "", title)
    # Replace [...]
    title = re.sub(r"(?s)\[\s*\.\.\.\s*\]", "…", title)
    # Remove http links in superscript
    title = re.sub(r"\^\(\[?(https?:)?//[^]()]+\]?\)", "", title)
    # Remove any edit links to local pages
    title = re.sub(r"\[//[^]\s]+\s+edit\s*\]", "", title)
    # Replace links by their text

    category_ns_data: NamespaceDataEntry
    # XXX "Category" -> config variable for portability
    category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {})  # type: ignore[typeddict-item]
    # Fail if we received empty dict from .get()
    category_ns_names = {"Category", category_ns_data["name"]} | set(
        category_ns_data["aliases"]
    )
    category_names_pattern = rf"(?:{'|'.join(category_ns_names)})"
    while True:
        # Links may be nested, so keep replacing until there is no more change.
        orig = title
        # Category links are removed altogether.
        title = re.sub(
            rf"(?si)\s*\[\[\s*{category_names_pattern}\s*:\s*([^]]+?)\s*\]\]",
            "",
            title,
        )
        # [[target]] and [[target#anchor]] without a namespace prefix.
        title = re.sub(
            r"(?s)\[\[\s*:?([^]|#<>:]+?)\s*(#[^][|<>]*?)?\]\]", repl_1, title
        )
        # [[prefix:target]] without link text (optionally a trailing "|").
        title = re.sub(
            r"(?s)\[\[\s*(([\w\d]+)\s*:)?\s*([^][#|<>]+?)"
            r"\s*(#[^][|]*?)?\|?\]\]",
            repl_link,
            title,
        )
        # [[target|text]] and [[target|arg|...|text]] (also File/Image links).
        title = re.sub(
            r"(?s)\[\[\s*([^][|<>]+?)\s*\|"
            r"\s*(([^][|]|\[[^]]*\])+?)"
            r"(\s*\|\s*(([^][|]|\[[^]]*\])+?))*\s*\]\]",
            repl_link_bars,
            title,
        )
        if title == orig:
            break
    # Replace remaining external links by their label text (or by the URL
    # itself when the link has no label).
    while True:
        orig = title
        title = re.sub(
            r"\[\s*((https?:|mailto:)?//([^][]+?))\s*\]", repl_exturl, title
        )
        if title == orig:
            break

    # Remove italic and bold
    title = remove_italic_and_bold(title)

    # Replace HTML entities
    title = html.unescape(title)
    title = title.replace("\xa0", " ")  # nbsp
    # Remove left-to-right and right-to-left marks and zero-width characters
    title = re.sub(r"[\u200e\u200f\u200b\u200d\u200c\ufeff]", "", title)
    # Replace whitespace sequences by a single space.
    title = re.sub(r"[ \t\r]+", " ", title)
    title = re.sub(r" *\n+", "\n", title)
    # Eliminate spaces around ellipsis in brackets
    title = re.sub(r"\[\s*…\s*\]", "[…]", title)

    # This unicode quote seems to be used instead of apostrophe quite randomly
    # (about 4% of apostrophes in English entries, some in Finnish entries).
    # title = re.sub("\u2019", "'", title)  # Note: no r"..." here!
    # Replace strange unicode quotes with normal quotes
    # title = re.sub(r"”", '"', title)
    # Replace unicode long dash by normal dash
    # title = re.sub(r"–", "-", title)

    # Remove whitespace before periods and commas etc
    # XXX we might re-enable this, now trying without as it is removing some
    # instances where we would want to leave the space
    # title = re.sub(r" ([.,;:!?)])", repl_1, title)
    # Strip surrounding whitespace.
    if not no_strip:
        title = title.strip()
    # Normalize different ways of writing accents into the NFC canonical form
    title = unicodedata.normalize("NFC", title)
    return title

1595 

1596 

def clean_template_args(
    wxr: WiktextractContext,
    ht: Union[TemplateArgs, TemplateParameters],
    no_strip=False,
) -> dict[Union[str, int], str]:
    """Return a new dictionary in which every key and value of the template
    argument dictionary ``ht`` has been converted to ``str`` and passed
    through clean_value().

    HTML tag stripping is disabled for both keys and values.  ``no_strip``
    applies to the values only; keys always have surrounding whitespace
    removed.  If two keys clean to the same string, the later entry wins.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(ht, dict)
    cleaned: dict[Union[str, int], str] = {}
    for raw_key, raw_value in ht.items():
        # Clean the key before the value, as a dict comprehension would.
        key = clean_value(wxr, str(raw_key), no_html_strip=True)
        cleaned[key] = clean_value(
            wxr, str(raw_value), no_strip=no_strip, no_html_strip=True
        )
    return cleaned