Coverage for src/wiktextract/clean.py: 90%

1# This file contains code to clean Wiktionary annotations from a string and to

2# produce plain text from it, typically for glossary entries but this is also

3# called for various other data to produce clean strings.

5# This file also contains code for cleaning qualifiers for the "tags" field.

9import html

10import re

11import unicodedata

12from typing import Callable, Optional, Union

14from wikitextprocessor.common import MAGIC_FIRST, MAGIC_LAST, URL_STARTS

15from wikitextprocessor.core import NamespaceDataEntry, TemplateArgs

16from wikitextprocessor.parser import TemplateParameters

18from .wxr_context import WiktextractContext

20######################################################################

21# Cleaning values into plain text.

22######################################################################

# Table mapping ordinary characters to their Unicode superscript forms.
# Runs of characters are paired positionally with their superscript glyphs;
# characters with no entry here (for example "C", "F", "Q" or "q") cannot be
# rendered as a superscript glyph by this table.
superscript_ht: dict[str, str] = {
    **dict(zip("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")),
    "+": "⁺",
    # Hyphen-minus and five look-alike characters all become superscript minus.
    **dict.fromkeys(("-", "−", "‐", "–", "—", "一"), "⁻"),
    "=": "⁼",
    "(": "⁽",
    ")": "⁾",
    **dict(zip("ABDEGHIJKLMNOPRTUVW", "ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂ")),
    **dict(zip("abcdefghijklmnoprstuvwxyz", "ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻ")),
    **dict(zip("βγδθιφχ", "ᵝᵞᵟᶿᶥᵠᵡ")),
    "∞": "\u2002᪲",  # This is a KLUDGE
}

# Table mapping ordinary characters to their Unicode subscript forms.
# Only the characters listed here have a subscript glyph in this table.
subscript_ht: dict[str, str] = {
    **dict(zip("0123456789", "₀₁₂₃₄₅₆₇₈₉")),
    "+": "₊",
    # Both hyphen-minus and the minus sign become subscript minus.
    **dict.fromkeys(("-", "−"), "₋"),
    "=": "₌",
    "(": "₍",
    ")": "₎",
    **dict(zip("aehijklmnoprstuvx", "ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ")),
    "ə": "ₔ",
    **dict(zip("ρφχ", "ᵨᵩᵪ")),
}

138

139

def to_superscript(text: str) -> str:
    """Converts text to superscript.

    When every character of ``text`` has an entry in ``superscript_ht`` the
    result is built from those glyphs.  Otherwise a caret notation is used:
    ``^x`` for a single character and ``^(...)`` for longer text.  Empty
    input yields an empty string.
    """
    if not text:
        return ""
    try:
        # Fails on the first character that has no superscript glyph.
        return "".join(superscript_ht[ch] for ch in text)
    except KeyError:
        pass
    return "^" + text if len(text) == 1 else "^(" + text + ")"

149

150

def to_subscript(text: str) -> str:
    """Converts text to subscript.

    When every character of ``text`` has an entry in ``subscript_ht`` the
    result is built from those glyphs.  Otherwise an underscore notation is
    used: ``_x`` for a single character and ``_(...)`` for longer text.
    Empty input yields an empty string.
    """
    if not text:
        return ""
    try:
        # Fails on the first character that has no subscript glyph.
        return "".join(subscript_ht[ch] for ch in text)
    except KeyError:
        pass
    return "_" + text if len(text) == 1 else "_(" + text + ")"

160

161

def to_chem(text: str) -> str:
    """Converts text to chemical formula, making digits subscript.

    Every character for which ``str.isdigit()`` is true is passed through
    ``to_subscript``; all other characters are copied unchanged.
    """
    pieces: list[str] = []
    for ch in text:
        if ch.isdigit():
            pieces.append(to_subscript(ch))
        else:
            pieces.append(ch)
    return "".join(pieces)

165

166

# Mapping from LaTeX command names (without the leading backslash) to the
# Unicode character or string used to render them as plain text.  This is
# the default mapping (some cases are handled specially in the code).
# Commands mapped to the empty string are dropped from the output.
math_map: dict[str, str] = {
    # XXX should probably change greek characters to non-slanted ones?
    "AC": "∿",
    "APLcomment": "⍝",
    "APLdownarrowbox": "⍗",
    "APLinput": "⍞",
    "APLinv": "⌹",
    "APLleftarrowbox": "⍇",
    "APLlog": "⍟",
    "APLrightarrowbox": "⍈",
    "APLuparrowbox": "⍐",
    "Angstroem": "Å",
    "Bot": "⫫",
    "Box": "□",
    "Bumpeq": "≎",
    "CIRCLE": "●",
    "Cap": "⋒",
    "CapitalDifferentialD": "ⅅ",
    "CheckedBox": "☑",
    "Circle": "○",
    "Coloneqq": "⩴",
    "ComplexI": "ⅈ",
    "ComplexJ": "ⅉ",
    "Cup": "⋓",
    "Delta": "Δ",
    "Diamond": "◇",
    "Diamondblack": "◆",
    "Diamonddot": "⟐",
    "DifferentialD": "ⅆ",
    "Digamma": "Ϝ",
    "Doteq": "≑",
    "DownArrowBar": "⤓",
    "DownLeftTeeVector": "⥞",
    "DownLeftVectorBar": "⥖",
    "DownRightTeeVector": "⥟",
    "DownRightVectorBar": "⥗",
    "Downarrow": "⇓",
    "Equal": "⩵",
    "Euler": "Ɛ",
    "ExponentialE": "ⅇ",
    "ExponetialE": "ⅇ",
    "Finv": "Ⅎ",
    "Gamma": "Γ",
    "Im": "ℑ",
    "Join": "⨝",
    "Koppa": "Ϟ",
    "LEFTCIRCLE": "◖",
    "LEFTcircle": "◐",
    "LHD": "◀",
    "LVec": "x⃖",
    "Lambda": "Λ",
    "Lbag": "⟅",
    "LeftArrowBar": "⇤",
    "LeftDownTeeVector": "⥡",
    "LeftDownVectorBar": "⥙",
    "LeftTeeVector": "⥚",
    "LeftTriangleBar": "⧏",
    "LeftUpTeeVector": "⥠",
    "LeftUpVectorBar": "⥘",
    "LeftVectorBar": "⥒",
    "Leftarrow": "⇐",
    "Leftrightarrow": "⇔",
    "Lleftarrow": "⇚",
    "Longleftarrow": "⟸",
    "Longleftrightarrow": "⟺",
    "Longmapsfrom": "⟽",
    "Longmapsto": "⟾",
    "Longrightarrow": "⟹",
    "Lparen": "⦅",
    "Lsh": "↰",
    "MapsDown": "↧",
    "MapsUp": "↥",
    "Mapsfrom": "⤆",
    "Mapsto": "⤇",
    "Micro": "µ",
    "Nearrow": "⇗",
    "NestedGreaterGreater": "⪢",
    "NestedLessLess": "⪡",
    "NotGreaterLess": "≹",
    "NotGreaterTilde": "≵",
    "NotLessTilde": "≴",
    "Nwarrow": "⇖",
    "Omega": "Ω",
    "Phi": "Φ",
    "Pi": "Π",
    "Proportion": "∷",
    "Psi": "Ψ",
    "Qoppa": "Ϙ",
    "RHD": "▶",
    "RIGHTCIRCLE": "◗",
    "RIGHTcircle": "◑",
    "Rbag": "⟆",
    "Re": "ℜ",
    "RightArrowBar": "⇥",
    "RightDownTeeVector": "⥝",
    "RightDownVectorBar": "⥕",
    "RightTeeVector": "⥛",
    "RightTriangleBar": "⧐",
    "RightUpTeeVector": "⥜",
    "RightUpVectorBar": "⥔",
    "RightVectorBar": "⥓",
    "Rightarrow": "⇒",
    "Rparen": "⦆",
    "Rrightarrow": "⇛",
    "Rsh": "↱",
    "S": "§",
    "Same": "⩶",
    "Sampi": "Ϡ",
    "Searrow": "⇘",
    "Sigma": "Σ",
    "Square": "☐",
    "Stigma": "Ϛ",
    "Subset": "⋐",
    "Sun": "☉",
    "Supset": "⋑",
    "Swarrow": "⇙",
    "Theta": "Θ",
    "Top": "⫪",
    "UpArrowBar": "⤒",
    "Uparrow": "⇑",
    "Updownarrow": "⇕",
    "Upsilon": "Υ",
    "VDash": "⊫",
    "VERT": "⦀",
    "Vdash": "⊩",
    "Vert": "‖",
    "Vvdash": "⊪",
    "XBox": "☒",
    "Xi": "Ξ",
    "Yup": "⅄",
    "_": "_",
    "aleph": "א",
    "alpha": "α",
    "amalg": "⨿",
    "anchor": "⚓",
    "angle": "∠",
    "approx": "≈",
    "approxeq": "≊",
    "aquarius": "♒",
    "arg": "arg",
    "aries": "♈",
    "arrowbullet": "➢",
    "ast": "∗",
    "asymp": "≍",
    "backepsilon": "϶",
    "backprime": "‵",
    "backsim": "∽",
    "backsimeq": "⋍",
    # \backslash typesets a literal backslash, same as \textbackslash below.
    "backslash": "\\",
    "ballotx": "✗",
    "barin": "⋶",
    "barleftharpoon": "⥫",
    "barrightharpoon": "⥭",
    "barwedge": "⊼",
    "because": "∵",
    "beta": "β",
    "beth": "ב",
    "between": "≬",
    "bigcap": "∩",
    "bigcup": "∪",
    "biginterleave": "⫼",
    "bigodot": "⨀",
    "bigoplus": "⨁",
    "bigotimes": "⨂",
    "bigsqcap": "⨅",
    "bigsqcup": "⨆",
    "bigstar": "★",
    "bigtriangledown": "▽",
    "bigtriangleup": "△",
    "biguplus": "⨄",
    "bigvee": "∨",
    "bigwedge": "∧",
    "bij": "⤖",
    "biohazard": "☣",
    "blacklozenge": "⧫",
    "blacksmiley": "☻",
    "blacksquare": "■",
    "blacktriangledown": "▾",
    "blacktriangleleft": "◂",
    "blacktriangleright": "▸",
    "blacktriangleup": "▴",
    "bot": "⊥",
    "bowtie": "⋈",
    "boxast": "⧆",
    "boxbar": "◫",
    "boxbox": "⧈",
    "boxbslash": "⧅",
    "boxcircle": "⧇",
    "boxdot": "⊡",
    "boxminus": "⊟",
    "boxplus": "⊞",
    "boxslash": "⧄",
    "boxtimes": "⊠",
    "bullet": "•",
    "bumpeq": "≏",
    "cancer": "♋",
    "cap": "∩",
    "capricornus": "♑",
    "capwedge": "⩄",
    "cat": "⁀",
    "cdot": "·",
    "cdots": "⋯",
    "cent": "¢",
    "checkmark": "✓",
    "chi": "χ",
    "circ": "∘",
    "circeq": "≗",
    "circlearrowleft": "↺",
    "circlearrowright": "↻",
    "circledR": "®",
    "circledast": "⊛",
    "circledbslash": "⦸",
    "circledcirc": "⊚",
    "circleddash": "⊝",
    "circledgtr": "⧁",
    "circledless": "⧀",
    "clubsuit": "♣",
    "colon": ":",
    "coloneq": "≔",
    "complement": "∁",
    "cong": "≅",
    "coprod": "∐",
    "corresponds": "≙",
    "cup": "∪",
    "curlyeqprec": "⋞",
    "curlyeqsucc": "⋟",
    "curlyvee": "⋎",
    "curlywedge": "⋏",
    "curvearrowleft": "↶",
    "curvearrowright": "↷",
    "dagger": "†",
    "daleth": "ד",
    "dashleftarrow": "⇠",
    "dashrightarrow": "⇢",
    "dashv": "⊣",
    "ddagger": "‡",
    "delta": "δ",
    "diameter": "∅",
    "diamond": "⋄",
    "diamondsuit": "♢",
    "digamma": "ϝ",
    "div": "÷",
    "divideontimes": "⋇",
    "dlsh": "↲",
    "dot\\bigvee": "⩒",
    "dot\\cap": "⩀",
    "dot\\cup": "⊍",
    "dot\\lor": "⩒",
    "dot\\vee": "⩒",
    "doteq": "≐",
    "dotplus": "∔",
    "dots": "…",
    "doublebarwedge": "⩞",
    "downarrow": "↓",
    "downdownarrows": "⇊",
    "downdownharpoons": "⥥",
    "downharpoonleft": "⇃",
    "downharpoonright": "⇂",
    "downuparrows": "⇵",
    "downupharpoons": "⥯",
    "drsh": "↳",
    "dsub": "⩤",
    "earth": "♁",
    "eighthnote": "♪",
    "ell": "ℓ",
    "emptyset": "∅",
    "epsilon": "ϵ",
    "eqcirc": "≖",
    "eqcolon": "∹",
    "eqsim": "≂",
    "eqslantgtr": "⪖",
    "eqslantless": "⪕",
    "equiv": "≡",
    "eta": "η",
    "eth": "ð",
    "exists": "∃",
    "fallingdotseq": "≒",
    "fcmp": "⨾",
    "female": "♀",
    "ffun": "⇻",
    "finj": "⤕",
    "fint": "⨏",
    "flat": "♭",
    "footnotesize": "",
    "forall": "∀",
    "fourth": "⁗",
    "frown": "⌢",
    "frownie": "☹",
    "gamma": "γ",
    # \ge is the short form of \geq (greater-than-or-equal), not ">".
    "ge": "≥",
    "gemini": "♊",
    "geq": "≥",
    "geqq": "≧",
    "geqslant": "⩾",
    "gg": "≫",
    "ggcurly": "⪼",
    "ggg": "⋙",
    "gimel": "ג",
    "gnapprox": "⪊",
    "gneq": "⪈",
    "gneqq": "≩",
    "gnsim": "⋧",
    "gtrapprox": "⪆",
    "gtrdot": "⋗",
    "gtreqless": "⋛",
    "gtreqqless": "⪌",
    "gtrless": "≷",
    "gtrsim": "≳",
    "hash": "⋕",
    "heartsuit": "♡",
    "hookleftarrow": "↩",
    "hookrightarrow": "↪",
    "hslash": "ℏ",
    "iddots": "⋰",
    "iff": "⟺",
    "iiiint": "⨌",
    "iiint": "∭",
    "iint": "∬",
    "imath": "ı",
    "implies": "⟹",
    "in": "∈",
    "infty": "∞",
    "int": "∫",
    "intercal": "⊺",
    "interleave": "⫴",
    "invamp": "⅋",
    "invdiameter": "⍉",
    "invneg": "⌐",
    "iota": "ι",
    "jmath": "ȷ",
    "jupiter": "♃",
    "kappa": "κ",
    "koppa": "ϟ",
    "lambda": "λ",
    "land": "∧",
    "lang": "⟪",
    "langle": "⟨",
    "large": "",
    "lblot": "⦉",
    "lbrace": "{",
    "lbrack": "[",
    "lceil": "⌈",
    "ldots": "…",
    # \le is the short form of \leq (less-than-or-equal), not "<".
    "le": "≤",
    "leadsto": "⤳",
    "leftarrow": "←",
    "leftarrowtail": "↢",
    "leftarrowtriangle": "⇽",
    "leftbarharpoon": "⥪",
    "leftharpoondown": "↽",
    "leftharpoonup": "↼",
    "leftleftarrows": "⇇",
    "leftleftharpoons": "⥢",
    "leftmoon": "☾",
    "leftrightarrow": "↔",
    "leftrightarrows": "⇆",
    "leftrightarrowtriangle": "⇿",
    "leftrightharpoon": "⥊",
    "leftrightharpoondown": "⥐",
    "leftrightharpoons": "⇋",
    "leftrightharpoonup": "⥎",
    "leftrightsquigarrow": "↭",
    "leftslice": "⪦",
    "leftsquigarrow": "⇜",
    "leftthreetimes": "⋋",
    "leftupdownharpoon": "⥑",
    "leo": "♌",
    "leq": "≤",
    "leqq": "≦",
    "leqslant": "⩽",
    "lessapprox": "⪅",
    "lessdot": "⋖",
    "lesseqgtr": "⋚",
    "lesseqqgtr": "⪋",
    "lessgtr": "≶",
    "lessim": "≲",
    "lesssim": "≲",
    "lfloor": "⌊",
    "lgroup": "⟮",
    "lhd": "◁",
    "libra": "♎",
    "lightning": "↯",
    "limg": "⦇",
    "ll": "≪",
    "llbracket": "⟦",
    "llcorner": "⌞",
    "llcurly": "⪻",
    "lll": "⋘",
    "lnapprox": "⪉",
    "lneq": "⪇",
    "lneqq": "≨",
    "lnot": "¬",
    "lnsim": "⋦",
    "longleftarrow": "⟵",
    "longleftrightarrow": "⟷",
    "longmapsfrom": "⟻",
    "longmapsto": "⟼",
    "longrightarrow": "⟶",
    "looparrowleft": "↫",
    "looparrowright": "↬",
    "lor": "∨",
    "lozenge": "◊",
    "lrcorner": "⌟",
    "ltimes": "⋉",
    "male": "♂",
    "maltese": "✠",
    "mapsfrom": "↤",
    "mapsto": "↦",
    "measuredangle": "∡",
    "medbullet": "⚫",
    "medcirc": "⚪",
    "mercury": "☿",
    "mho": "℧",
    "mid": "∣",
    "mlcp": "⫛",
    "mod": " mod ",
    "models": "⊧",
    "mp": "∓",
    "mu": "μ",
    "multimap": "⊸",
    "multimapboth": "⧟",
    "multimapdotbothA": "⊶",
    "multimapdotbothB": "⊷",
    "multimapinv": "⟜",
    "nLeftarrow": "⇍",
    "nLeftrightarrow": "⇎",
    "nRightarrow": "⇏",
    "nVDash": "⊯",
    "nVdash": "⊮",
    "nabla": "∇",
    "napprox": "≉",
    "natural": "♮",
    "ncong": "≇",
    "nearrow": "↗",
    "neg": "¬",
    "neptune": "♆",
    "neq": "≠",
    "nequiv": "≢",
    "nexists": "∄",
    "ngeq": "≱",
    "ngtr": "≯",
    "ni": "∋",
    "nleftarrow": "↚",
    "nleftrightarrow": "↮",
    "nleq": "≰",
    "nless": "≮",
    "nmid": "∤",
    "nni": "∌",
    "normalsize": "",
    "not\\in": "∉",
    "not\\ni": "∌",
    "not\\preceq": "⋠",
    "not\\subset": "⊄",
    "not\\subseteq": "⊈",
    "not\\succeq": "⋡",
    "not\\supset": "⊅",
    "not\\supseteq": "⊉",
    "not\\trianglelefteq": "⋬",
    "not\\trianglerighteq": "⋭",
    "not\\vartriangleleft": "⋪",
    "not\\vartriangleright": "⋫",
    "notasymp": "≭",
    "notbackslash": "⍀",
    "notin": "∉",
    "notslash": "⌿",
    "nparallel": "∦",
    "nprec": "⊀",
    "npreceq": "⋠",
    "nrightarrow": "↛",
    "nsim": "≁",
    "nsimeq": "≄",
    "nsqsubseteq": "⋢",
    "nsqsupseteq": "⋣",
    "nsubset": "⊄",
    "nsubseteq": "⊈",
    "nsucc": "⊁",
    "nsucceq": "⋡",
    "nsupset": "⊅",
    "nsupseteq": "⊉",
    "ntriangleleft": "⋪",
    "ntrianglelefteq": "⋬",
    "ntriangleright": "⋫",
    "ntrianglerighteq": "⋭",
    "nu": "ν",
    "nvDash": "⊭",
    "nvdash": "⊬",
    "nwarrow": "↖",
    "odot": "⊙",
    "oiiint": "∰",
    "oiint": "∯",
    "oint": "∮",
    "ointctrclockwise": "∳",
    "omega": "ω",
    "ominus": "⊖",
    "oplus": "⊕",
    "oslash": "⊘",
    "otimes": "⊗",
    "over": "/",
    "overbrace": "⏞",
    "overleftrightarrow": "x⃡",
    "overparen": "⏜",
    "overset?=": "≟",
    "overset{?}{=}": "≟",
    "overset{\\operatorname{def}}{=}": "≝",
    "parallel": "∥",
    "partial": "∂",
    "pencil": "✎",
    "perp": "⊥",
    "pfun": "⇸",
    "phi": "ϕ",
    "pi": "π",
    "pinj": "⤔",
    "pisces": "♓",
    "pitchfork": "⋔",
    "pluto": "♇",
    "pm": "±",
    "pointright": "☞",
    "pounds": "£",
    "prec": "≺",
    "precapprox": "⪷",
    "preccurlyeq": "≼",
    "preceq": "⪯",
    "preceqq": "⪳",
    "precnapprox": "⪹",
    "precnsim": "⋨",
    "precsim": "≾",
    "prime": "′",
    "prod": "∏",
    "propto": "∝",
    "psi": "ψ",
    "psur": "⤀",
    "qoppa": "ϙ",
    "quad": " ",
    "quarternote": "♩",
    "radiation": "☢",
    "rang": "⟫",
    "rangle": "⟩",
    "rarr": "→",
    "rblot": "⦊",
    "rbrace": "}",
    "rbrack": "]",
    "rceil": "⌉",
    "recycle": "♻",
    "rfloor": "⌋",
    "rgroup": "⟯",
    "rhd": "▷",
    "rho": "ρ",
    "rightangle": "∟",
    "rightarrow": "→",
    "rightarrowtail": "↣",
    "rightarrowtriangle": "⇾",
    "rightbarharpoon": "⥬",
    "rightharpoondown": "⇁",
    "rightharpoonup": "⇀",
    "rightleftarrows": "⇄",
    "rightleftharpoon": "⥋",
    "rightleftharpoons": "⇌",
    "rightmoon": "☽",
    "rightrightarrows": "⇉",
    "rightrightharpoons": "⥤",
    "rightslice": "⪧",
    "rightsquigarrow": "⇝",
    "rightthreetimes": "⋌",
    "rightupdownharpoon": "⥏",
    "rimg": "⦈",
    "risingdotseq": "≓",
    "rrbracket": "⟧",
    "rsub": "⩥",
    "rtimes": "⋊",
    "sagittarius": "♐",
    "sampi": "ϡ",
    "saturn": "♄",
    "scorpio": "♏",
    "scriptsize": "",
    "searrow": "↘",
    "second": "″",
    "setminus": "⧵",
    "sharp": "♯",
    "sigma": "σ",
    "sim": "∼",
    "simeq": "≃",
    "sixteenthnote": "♬",
    "skull": "☠",
    "slash": "∕",
    "small": "",
    "smallsetminus": "∖",
    "smalltriangledown": "▿",
    "smalltriangleleft": "◃",
    "smalltriangleright": "▹",
    "smalltriangleup": "▵",
    "smile": "⌣",
    "smiley": "☺",
    "spadesuit": "♠",
    "spddot": "¨",
    "sphat": "^",
    "sphericalangle": "∢",
    "spot": "⦁",
    "sptilde": "~",
    "sqcap": "⊓",
    "sqcup": "⊔",
    "sqint": "⨖",
    "sqrt": "√",  # ∛ ∜ - partly special handling below
    "sqrt[3]": "∛",
    "sqrt[4]": "∜",
    "sqsubset": "⊏",
    "sqsubseteq": "⊑",
    "sqsupset": "⊐",
    "sqsupseteq": "⊒",
    "square": "□",
    "sslash": "⫽",
    "star": "⋆",
    "steaming": "☕",
    "stigma": "ϛ",
    "strictfi": "⥼",
    "strictif": "⥽",
    "subset": "⊂",
    "subseteq": "⊆",
    "subseteqq": "⫅",
    "subsetneq": "⊊",
    "subsetneqq": "⫋",
    "succ": "≻",
    "succapprox": "⪸",
    "succcurlyeq": "≽",
    "succeq": "⪰",
    "succeqq": "⪴",
    "succnapprox": "⪺",
    "succnsim": "⋩",
    "succsim": "≿",
    "sum": "∑",
    "sun": "☼",
    "supset": "⊃",
    "supseteq": "⊇",
    "supseteqq": "⫆",
    "supsetneq": "⊋",
    "supsetneqq": "⫌",
    "swarrow": "↙",
    "swords": "⚔",
    "talloblong": "⫾",
    "tau": "τ",
    "taurus": "♉",
    "tcohm": "Ω",
    "textbackslash": "\\",
    "textbar": "|",
    "textbullet": "•",
    "textgreater": ">",
    "textless": "<",
    "textprime": "′",
    "therefore": "∴",
    "theta": "θ",
    "third": "‴",
    "times": "×",
    "tiny": "",
    "to": "→",
    "top": "⊤",
    "triangle": "∆",
    "trianglelefteq": "⊴",
    "triangleq": "≜",
    "trianglerighteq": "⊵",
    "twoheadleftarrow": "↞",
    "twoheadrightarrow": "↠",
    "twonotes": "♫",
    "ulcorner": "⌜",
    "underbar": " ̱",
    "underbrace": "⏟",
    "underleftarrow": "x⃮",
    "underline": " ̲",
    "underparen": "⏝",
    "underrightarrow": "x⃯",
    "uparrow": "↑",
    "updownarrow": "↕",
    "updownarrows": "⇅",
    "updownharpoons": "⥮",
    "upharpoonleft": "↿",
    "upharpoonright": "↾",
    "uplus": "⊎",
    "upsilon": "υ",
    "upuparrows": "⇈",
    "upupharpoons": "⥣",
    "uranus": "♅",
    "urcorner": "⌝",
    "utilde": " ̰",
    "vDash": "⊨",
    "varbeta": "β",
    "varclubsuit": "♧",
    "vardiamondsuit": "♦",
    "varepsilon": "ε",
    "varheartsuit": "♥",
    "varkappa": "ϰ",
    "varnothing": "∅",
    "varointclockwise": "∲",
    "varphi": "φ",
    "varpi": "ϖ",
    "varprod": "⨉",
    "varrho": "ϱ",
    "varsigma": "ς",
    "varspadesuit": "♤",
    "vartheta": "θ",
    "vartriangleleft": "⊲",
    "vartriangleright": "⊳",
    "vdash": "⊢",
    "vdots": "⋮",
    "vee": "∨",
    "veebar": "⊻",
    "vert": "|",
    "virgo": "♍",
    "warning": "⚠",
    "wasylozenge": "⌑",
    "wedge": "∧",
    "widehat=": "≙",
    "widehat{=}": "≙",
    "wp": "℘",
    "wr": "≀",
    "xi": "ξ",
    "yen": "¥",
    "yinyang": "☯",
    "zcmp": "⨟",
    "zeta": "ζ",
    "zhide": "⧹",
    "zpipe": "⨠",
    "zproject": "⨡",
    "|": "‖",
    # Accents XXX these really should be handled specially with diacritics
    # after argument
    "acute": "́",
    "bar": "̄",
    "breve": "̆",
    "check": "̌",
    "ddddot": "⃜",
    "dddot": "⃛",
    "ddot": "̈",
    "ddots": "⋱",
    "dot": "̇",
    "grave": "̀",
    "hat": "̂",
    "lvec": "⃐",
    "mathring": "̊",
    "not": "̸",
    "overline": "◌̅",
    "tilde": "̃",
    "vec": "⃑",
    # Some ignored operators
    "bigl": "",
    "bigr": "",
    "left": "",
    "right": "",
    "style": "",
    "textstyle": "",
    "mathrm": "",
}

918

# Script (calligraphic) forms of the ASCII letters, used for \mathcal.
# The two strings are paired position by position: A-Z first, then a-z.
mathcal_map: dict[str, str] = dict(
    zip(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz",
        "𝒜ℬ𝒞𝒟ℰℱ𝒢ℋℐ𝒥𝒦ℒℳ𝒩𝒪𝒫𝒬ℛ𝒮𝒯𝒰𝒱𝒲𝒳𝒴𝒵" "𝒶𝒷𝒸𝒹ℯ𝒻ℊ𝒽𝒾𝒿𝓀𝓁𝓂𝓃ℴ𝓅𝓆𝓇𝓈𝓉𝓊𝓋𝓌𝓍𝓎𝓏",
    )
)

973

# Fraktur (black-letter) forms of the ASCII letters, used for \mathfrak.
# Capitals C, H, I, R and Z live in the Letterlike Symbols block; the other
# capitals and all lowercase letters are in Mathematical Alphanumeric
# Symbols.  "I" and "R" were previously absent, so \mathfrak{I} and
# \mathfrak{R} were left unconverted; lowercase letters were absent as well.
mathfrak_map: dict[str, str] = {
    "A": "𝔄",
    "B": "𝔅",
    "C": "ℭ",
    "D": "𝔇",
    "E": "𝔈",
    "F": "𝔉",
    "G": "𝔊",
    "H": "ℌ",
    "I": "ℑ",
    "J": "𝔍",
    "K": "𝔎",
    "L": "𝔏",
    "M": "𝔐",
    "N": "𝔑",
    "O": "𝔒",
    "P": "𝔓",
    "Q": "𝔔",
    "R": "ℜ",
    "S": "𝔖",
    "T": "𝔗",
    "U": "𝔘",
    "V": "𝔙",
    "W": "𝔚",
    "X": "𝔛",
    "Y": "𝔜",
    "Z": "ℨ",
    # Lowercase Fraktur letters occupy the contiguous range
    # U+1D51E (a) .. U+1D537 (z).
    **{chr(ord("a") + i): chr(0x1D51E + i) for i in range(26)},
}

1000

# Double-struck (blackboard bold) forms, used for \mathbb.  Letter and digit
# runs are paired position by position with their double-struck glyphs; a
# handful of Greek letter names are keyed by name rather than by character.
mathbb_map: dict[str, str] = {
    **dict(zip("ABCDEFGHIJKLMNOPQRSTUVWXYZ", "𝔸𝔹ℂ𝔻𝔼𝔽𝔾ℍ𝕀𝕁𝕂𝕃𝕄ℕ𝕆ℙℚℝ𝕊𝕋𝕌𝕍𝕎𝕏𝕐ℤ")),
    **dict(zip("abcdefghijklmnopqrstuvwxyz", "𝕒𝕓𝕔𝕕𝕖𝕗𝕘𝕙𝕚𝕛𝕜𝕝𝕞𝕟𝕠𝕡𝕢𝕣𝕤𝕥𝕦𝕧𝕨𝕩𝕪𝕫")),
    "pi": "ℼ",
    "gamma": "ℽ",
    "Gamma": "ℾ",
    "Pi": "ℿ",
    "Sigma": "⅀",
    **dict(zip("0123456789", "𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡")),
}

1070

1071

def mathcal_fn(text: str) -> str:
    """Translate ``text`` character by character through ``mathcal_map``.

    Characters without an entry in the table are kept as they are.
    """
    converted: list[str] = []
    for ch in text:
        converted.append(mathcal_map.get(ch, ch))
    return "".join(converted)

1074

1075

def mathfrak_fn(text: str) -> str:
    """Translate ``text`` character by character through ``mathfrak_map``.

    Characters without an entry in the table are kept as they are.
    """
    converted: list[str] = []
    for ch in text:
        converted.append(mathfrak_map.get(ch, ch))
    return "".join(converted)

1078

1079

def mathbb_fn(text: str) -> str:
    """Translate ``text`` character by character through ``mathbb_map``.

    Characters without an entry in the table are kept as they are.  The
    lookup is per character, so the multi-character keys of ``mathbb_map``
    (such as "pi") are never matched here.
    """
    converted: list[str] = []
    for ch in text:
        converted.append(mathbb_map.get(ch, ch))
    return "".join(converted)

1082

1083

def to_math(text: str) -> str:
    """Converts a LaTeX-style mathematical formula to plain text.

    Commands are looked up in ``math_map`` (the result may contain Unicode
    symbols, not only ASCII), ``_``/``^`` arguments go through
    ``to_subscript``/``to_superscript``, and ``\\mathcal``, ``\\mathfrak`` and
    ``\\mathbb`` arguments go through the corresponding ``*_fn`` helper.
    Brace groups are processed innermost-first: each processed group is
    stored in ``magic_vec`` and replaced in the text by a single placeholder
    character, which is substituted back at the end.
    """
    # print("to_math: {!r}".format(text))
    # Processed brace groups; the group stored at index i is represented in
    # the working text by the character chr(MAGIC_FIRST + i).
    magic_vec: list[str] = []

    def expand(text: str) -> str:
        """Substitute placeholder characters with their stored text,
        repeating until the text no longer changes (stored text may itself
        contain placeholders)."""
        while True:
            orig = text
            # formatting with {:c} converts input into character
            text = re.sub(
                r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST),
                lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST],
                text,
            )
            if text == orig:
                break
        return text

    def recurse(text: str) -> str:
        """Convert one formula (or the content of one brace group)."""

        def math_magic(
            text: str, left: str, right: str, fn: Callable[[str], str]
        ) -> str:
            """Replace every ``left``...``right`` group that contains no
            further delimiters by a placeholder whose stored value is
            ``fn(content).strip()``; repeat until nothing changes."""
            regexp_str = r"{}([^{}{}]+){}".format(
                re.escape(left),
                re.escape(left),
                re.escape(right),
                re.escape(right),
            )
            regexp = re.compile(regexp_str)

            def repl(m: re.Match) -> str:
                # Allocate the next placeholder before storing the result.
                magic = chr(MAGIC_FIRST + len(magic_vec))
                t = fn(m.group(1)).strip()
                magic_vec.append(t)
                return magic

            while True:
                orig = text
                text = re.sub(regexp, repl, text)
                if text == orig:
                    break
            return text

        def expand_group(v: str) -> str:
            """Convert a single token produced by the tokenizer below."""
            # Function applied to the token's argument after the prefix has
            # been removed (set by the font, subscript and superscript cases).
            fn: Optional[Callable[[str], str]] = None
            if re.match(r"\\mathcal\b", v):
                fn = mathcal_fn
                v = v[8:].strip()  # drop the 8 characters of "\mathcal"
            elif re.match(r"\\mathfrak\b", v):
                fn = mathfrak_fn
                v = v[9:].strip()  # drop "\mathfrak"
            elif re.match(r"\\mathbb\b", v):
                fn = mathbb_fn
                # NOTE(review): unlike the two branches above there is no
                # .strip() here -- confirm whether that is intended.
                v = v[7:]  # drop "\mathbb"
            elif re.match(r"\\(begin|end)\b", v):
                v = ""  # Skip
            elif re.match(r"\\text\b", v):
                v = v[5:]  # drop "\text"
            elif re.match(r"\\pmod\b", v):
                v = v[5:].strip()  # drop "\pmod"
                v = "(mod " + expand_group(v) + ")"
            elif re.match(r"\\sqrt\[", v):
                # Text between "\sqrt[" and the final character of the token
                # (presumably the closing "]").
                a = v[6:-1].strip()
                if a == "2":
                    v = "√"
                elif a == "3":
                    v = "∛"
                elif a == "4":
                    v = "∜"
                else:
                    v = to_superscript(a) + "√"
            elif re.match(r"\\sqrt($|[0-9]|\b)", v):
                v = "√"
            elif re.match(r"\\(frac|binom)($|[0-9]|\b)", v):
                # Expect exactly two operands, each a command, an escaped
                # character or a single character (which may be a
                # placeholder standing for a brace group).
                m = re.match(
                    r"\\(frac|binom)\s*(\\[a-zA-Z]+|\\.|.)\s*"
                    r"(\\[a-zA-Z]+|\\.|.)$",
                    v,
                )
                if not m:
                    print("MATH FRAC/BINOM ERROR: {!r}".format(v))
                    return v
                op, a, b = m.groups()
                a = expand_group(a).strip()
                b = expand_group(b).strip()
                # Parenthesize operands longer than one character.
                if len(a) > 1:
                    a = "(" + a + ")"
                if len(b) > 1:
                    b = "(" + b + ")"
                if op == "frac":
                    v = a + "/" + b
                elif op == "binom":
                    v = "binom({}, {})".format(a, b)
                else:
                    # Should never get here
                    v = "{}({})".format(op, v)
            elif v.startswith("_"):
                fn = to_subscript
                v = v[1:]
            elif v.startswith("^"):
                fn = to_superscript
                v = v[1:]
            # Whatever remains may itself be a command: map it through
            # math_map, or fall back to its bare name.
            if v.startswith("\\"):
                mapped = math_map.get(v[1:].strip())
                if mapped is None:
                    if v[1:].strip().isalnum():
                        # Unknown alphanumeric command: keep its name,
                        # padded with spaces.
                        v = " " + v[1:].strip() + " "
                    else:
                        v = v[1:].strip()
                else:
                    v = mapped
            elif v.isspace() or v in ("&",):  # Ignore certain special chars
                v = ""
            if fn is not None:
                # Resolve placeholders first so that fn sees the real text.
                v = expand(v)
                v = fn(v)
            v = expand(v)
            return v

        parts: list[str] = []
        # Collapse all brace groups (innermost first) into placeholders.
        while True:
            orig = text
            text = math_magic(text, "{", "}", recurse)
            if text == orig:
                break
        # Tokenize the remaining text and convert each token.
        # NOTE(review): original source line 1222 (between the
        # r"\\[a-zA-Z]+|\\.|.)|" and r"\b\s*|" fragments below) is absent
        # from this view, leaving the ")?" after the \sqrt fragment without
        # a matching "(".  Presumably the missing fragment opens an optional
        # group matching the prefix commands handled in expand_group --
        # restore it from the real file before relying on this pattern.
        for m in re.finditer(
            r"\s+|"
            r"\\frac\s*(\\[a-zA-Z]+|\\.|.)\s*"
            r"(\\dot\\(bigvee|cup|cap|lor|vee)|"
            r"\\not\\(subset|supset|subseteq|supseteq|in|ni|"
            r"preceq|succeq|vartrianglelefteq|"
            r"vartrianglerighteq|trianglelefteq|"
            r"trianglerighteq)|"
            r"\\widehat\{=\}|\\widehat=|"
            r"\\overset\{?\}\{=\}|"
            r"\\overset\?=|"
            r"\\overset\{\\operatorname\{def\}\}\{=\}|"
            r"\\[a-zA-Z]+|\\.|.)|"
            r"\b\s*|"
            r"\\sqrt\b(\[\d+\])?)?"
            r"[_^]?(\\[a-zA-Z]+\s*|\\.|\w+|.)",
            text,
        ):
            v = m.group(0).strip()
            if not v:
                continue
            v = expand_group(v)
            if v:
                # Keep a digit-initial part separated from a preceding part
                # that ends in a letter or a digit.
                if (
                    parts and parts[-1][-1].isalpha() and v[0] in "0123456789"
                ) or (
                    parts
                    and parts[-1][-1] in "0123456789"
                    and v[0] in "0123456789"
                ):
                    v = " " + v
                parts.append(v)

        text = "".join(parts)
        return text

    text = recurse(text)
    # print("math text final: {!r}".format(text))
    return text

1249

1250

def bold_follows(parts: list[str], i: int) -> bool:
    """Tell whether a bold marker (''') occurs in ``parts`` after index ``i``.

    Only elements that begin with three apostrophes count; everything else,
    including plain text and italic markers (''), is passed over.
    """
    return any(later.startswith("'''") for later in parts[i + 1 :])

1261

1262

def remove_italic_and_bold(text: str) -> str:
    """Strip wikitext italic ('') and bold (''') markers from ``text``.

    Based on token_iter in wikitextprocessor.  Each line is scanned
    separately and formatting state is reset at every line start.  A run of
    five or more apostrophes toggles both bold and italic (five characters
    consumed), a run of three or four toggles bold (three consumed), and a
    run of two toggles italic.  The one exception: a three-apostrophe run
    met while only italic is open, with no later bold marker on the line, is
    treated as closing the italic (two consumed).  Apostrophes left over
    from a run are kept as literal text.

    Note that the newline separators returned by the split are themselves
    processed as lines and a "\\n" is appended after every element, so each
    newline run in the input gains two extra newline characters.
    """
    assert isinstance(text, str)
    ITALIC, BOLD = 1, 2  # bit flags: 0 = plain, 3 = bold and italic
    quote_run_re = re.compile(r"(''+)")
    out: list[str] = []
    for segment in re.split(r"(\n+)", text):  # Lines and separators
        flags = 0
        tokens = quote_run_re.split(segment)
        for idx, token in enumerate(tokens):
            if not token.startswith("''"):
                # Ordinary text: copy through untouched.
                out.append(token)
                continue
            if token.startswith("'''''"):
                consumed = 5
                flags ^= ITALIC | BOLD
            elif not token.startswith("'''"):
                consumed = 2
                flags ^= ITALIC
            elif flags != ITALIC or bold_follows(tokens, idx):
                consumed = 3
                flags ^= BOLD
            else:
                # Italic-only with no bold marker later on the line: read
                # the run as the end of the italic plus one apostrophe.
                consumed = 2
                flags ^= ITALIC
            leftover = token[consumed:]
            if leftover:
                out.append(leftover)
        out.append("\n")
    # Drop the "\n" added after the final element.
    return "".join(out[:-1])

1327

1328

1329# regex to find File/Image link attributes that would mean an image

1330# is *not* inline

1332

1333

# Case-insensitive pattern consisting of a single capture group that
# alternates over every entry of URL_STARTS.
URL_STARTS_RE = re.compile(
    "(" + "|".join(URL_STARTS) + ")",
    flags=re.IGNORECASE,
)

# Pattern matching a File-namespace link prefix.  It needs namespace data
# from the context object, so it starts as None and is compiled by
# clean_value() on first use.
IMAGE_LINK_RE: Optional[re.Pattern] = None

1339

1340

def clean_value(
    wxr: WiktextractContext, title: str, no_strip=False, no_html_strip=False
) -> str:
    """Clean a title or value into a plain string.

    Removes Wikimedia formatting from ``title``: tables, references, HTML
    tags, links, emphasis markers and HTML entities.  Runs of spaces, tabs,
    carriage returns and en spaces are merged into one normal space, and
    the result is normalized to Unicode NFC.

    Args:
        wxr: Context used to look up the "File" and "Category" namespace
            names.
        title: The wikitext/HTML fragment to clean.
        no_strip: If true, surrounding whitespace is left in place (the
            final ``strip()`` is skipped).
        no_html_strip: If true, the generic removal of remaining HTML tags
            is skipped; only ``<noinclude/>`` is removed at that step.  Tags
            that are handled explicitly (``<ref>``, ``<span>``, ``<div>``,
            ``<sup>``, ...) are still processed.

    Returns:
        The cleaned string.

    Raises:
        KeyError: If the context has no usable "Category" namespace data.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(title, str)

    # Build the File/Image namespace matcher once and cache it at module
    # level for later calls.
    global IMAGE_LINK_RE
    if IMAGE_LINK_RE is None:
        image_link_prefixes = wxr.wtp.namespace_prefixes(
            wxr.wtp.NAMESPACE_DATA["File"]["id"], suffix=""
        )
        IMAGE_LINK_RE = re.compile(
            rf"(?:{'|'.join(image_link_prefixes)})\s*:", re.IGNORECASE
        )

    def repl_1(m: re.Match) -> str:
        # Recursively clean the first captured group, keeping whitespace.
        return clean_value(wxr, m.group(1), no_strip=True)

    def repl_exturl(m: re.Match) -> str:
        # Drop leading tokens that look like URLs and keep the rest (the
        # link label).  The last token is never dropped, so a bare URL
        # without a label is kept as-is.
        args = re.split(r"\s+", m.group(1))
        i = 0
        while i < len(args) - 1:
            if not URL_STARTS_RE.match(args[i]):
                break
            i += 1
        return " ".join(args[i:])

    def repl_link(m: re.Match) -> str:
        # Handles [[prefix:target#anchor]] style links without link text.
        before_colon = m.group(1)
        after_colon = m.group(3)
        if (
            before_colon is not None
            and IMAGE_LINK_RE.match(before_colon) is not None
        ):
            # File/Image links without any text produce nothing.
            return ""
        if before_colon is not None and before_colon.strip(": ") in ("w", "s"):
            # Wikipedia or Wikisource link
            v = after_colon.split("|")[0]
        else:
            v = m.group(0).strip("[] ").split("|")[0]
        return clean_value(wxr, v, no_strip=True)

    def repl_link_bars(m: re.Match) -> str:
        # Handles [[target|text|...]] style links.
        link = m.group(1)
        if IMAGE_LINK_RE.match(link) is not None:
            # Handle File / Image / Fichier 'links' here.
            # NOTE(review): `.match` anchors at the start of the whole link
            # text (which begins with "[["); confirm NOT_INLINE_IMG_RE is
            # written with that in mind.
            if NOT_INLINE_IMG_RE.match(m.group(0)) is None and "alt" in m.group(
                0
            ):
                # This image should be inline, so let's print its alt text
                alt_m = re.search(r"\|\s*alt\s*=([^]|]+)(\||\]\])", m.group(0))
                if alt_m is not None:
                    return "[Alt: " + alt_m.group(1) + "]"
            return ""
        # Group 5 sits inside a repeated group, so it holds the text of the
        # last repetition, i.e. the final "|"-separated segment.  When there
        # is only one bar it is None and group 2 (the first segment) is used.
        return clean_value(wxr, m.group(5) or m.group(2) or "", no_strip=True)

    def repl_1_sup(m: re.Match) -> str:
        return to_superscript(clean_value(wxr, m.group(1)))

    def repl_1_sub(m: re.Match) -> str:
        return to_subscript(clean_value(wxr, m.group(1)))

    def repl_1_chem(m: re.Match) -> str:
        return to_chem(clean_value(wxr, m.group(1)))

    def repl_1_math(m: re.Match) -> str:
        # The raw (uncleaned) content is handed to to_math().
        return to_math(m.group(1))

    def repl_1_syntaxhighlight(m: re.Match) -> str:
        # Content is preformatted
        return "\n" + m.group(1).strip() + "\n"

    # remove nowiki tag returned from `Wtp.node_to_html()`
    title = re.sub(r"<nowiki\s*/>", "", title)

    # Remove any remaining templates
    # title = re.sub(r"\{\{[^}]+\}\}", "", title)

    # Remove tables, which can contain other tables.  Each pass removes the
    # innermost tables; repeat until nothing changes.
    prev = ""
    while title != prev:
        prev = title
        title = re.sub(
            r"\{\|((?!\{\|)(?!\|\}).)*\|\}",
            "\n",
            title,
            flags=re.DOTALL,
        )
    # title = re.sub(r"(?s)\{\|.*?\|\}", "\n", title)
    # Remove second reference tags (<ref name="ref_name"/>)
    title = re.sub(r"<ref\s+name=\"[^\"]+\"\s*/>", "", title)
    # Remove references (<ref>...</ref>).
    title = re.sub(r"(?is)<ref\b\s*[^>/]*?>\s*.*?</ref\s*>", "", title)
    # Replace <span>...</span> by its content with whitespace runs
    # (including newlines) collapsed into single spaces
    title = re.sub(
        r"(?is)<span\b\s*[^>]*?>(.*?)\s*</span\s*>",
        lambda m: re.sub(r"\s+", " ", m.group(1)),
        title,
    )
    # Replace <br/> by a newline (it is used to express alternatives in some
    # declensions)
    title = re.sub(r"(?si)\s*<br\s*/?>\n*", "\n", title)
    # Remove divs with floatright class (generated e.g. by {{ja-kanji|...}})
    title = re.sub(
        r'(?si)<div\b[^>]*?\bclass="[^"]*?\bfloatright\b[^>]*?>'
        r"((<div\b(<div\b.*?</div\s*>|.)*?</div>)|.)*?"
        r"</div\s*>",
        "",
        title,
    )
    # Remove divs with float: attribute
    title = re.sub(
        r'(?si)<div\b[^>]*?\bstyle="[^"]*?\bfloat:[^>]*?>'
        r"((<div\b(<div\b.*?</div\s*>|.)*?</div>)|.)*?"
        r"</div\s*>",
        "",
        title,
    )
    # Remove <sup> with previewonly class (generated e.g. by {{taxlink|...}})
    title = re.sub(
        r'(?si)<sup\b[^>]*?\bclass="[^"<>]*?'
        r"\bpreviewonly\b[^>]*?>"
        r".+?</sup\s*>",
        "",
        title,
    )
    # Remove <strong class="error">...</strong>
    title = re.sub(
        r'(?si)<strong\b[^>]*?\bclass="[^"]*?\berror\b[^>]*?>'
        r".+?</strong\s*>",
        "",
        title,
    )
    # Change <div> and </div> to newlines. Ditto for tr, li, table, dl, ul, ol
    title = re.sub(r"(?si)</?(div|tr|li|table|dl|ul|ol)\b[^>]*>", "\n", title)
    # Change <dt>, <dd>, </dt> and </dd> into newlines;
    # these generate new rows/lines.
    title = re.sub(r"(?i)</?d[dt]\s*>", "\n", title)
    # Change <td> </td> to spaces. Ditto for th.
    title = re.sub(r"(?si)</?(td|th)\b[^>]*>", " ", title)
    # Drop empty <sup></sup>, then convert <sup> ... </sup> with
    # to_superscript()
    title = re.sub(r"(?si)<sup\b[^>]*>\s*</sup\s*>", "", title)
    title = re.sub(r"(?si)<sup\b[^>]*>(.*?)</sup\s*>", repl_1_sup, title)
    # Drop empty <sub></sub>, then convert <sub> ... </sub> with
    # to_subscript()
    title = re.sub(r"(?si)<sub\b[^>]*>\s*</sub\s*>", "", title)
    title = re.sub(r"(?si)<sub\b[^>]*>(.*?)</sub\s*>", repl_1_sub, title)
    # Change <chem> ... </chem> using subscripts for digits
    title = re.sub(r"(?si)<chem\b[^>]*>(.*?)</chem\s*>", repl_1_chem, title)
    # Change <math> ... </math> using special formatting.
    title = re.sub(r"(?si)<math\b[^>]*>(.*?)</math\s*>", repl_1_math, title)
    # Change <syntaxhighlight> ... </syntaxhighlight> using special formatting.
    title = re.sub(
        r"(?si)<syntaxhighlight\b[^>]*>(.*?)</syntaxhighlight\s*>",
        repl_1_syntaxhighlight,
        title,
    )
    # Remove any remaining HTML tags.
    if not no_html_strip:
        title = re.sub(r"(?s)<[/!a-zA-Z][^>]*>", "", title)
        title = re.sub(r"(?s)</[^>]+>", "", title)
    else:
        # Strip <noinclude/> anyway
        title = re.sub(r"(?si)<noinclude\s*/\s*>", "", title)
    # Replace [...]
    title = re.sub(r"(?s)\[\s*\.\.\.\s*\]", "…", title)
    # Remove http links in superscript, i.e. text of the form
    # ^(http://...) or ^([//...]).  The parentheses are literal and must be
    # escaped; the character class above already excludes "(" and ")" from
    # the URL part.
    title = re.sub(r"\^\(\[?(https?:)?//[^]()]+\]?\)", "", title)
    # Remove any edit links to local pages
    title = re.sub(r"\[//[^]\s]+\s+edit\s*\]", "", title)
    # Replace links by their text

    category_ns_data: NamespaceDataEntry
    # XXX "Category" -> config variable for portability
    category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {})  # type: ignore[typeddict-item]
    # Fail if we received empty dict from .get()
    category_ns_names = {"Category", category_ns_data["name"]} | set(
        category_ns_data["aliases"]
    )
    category_names_pattern = rf"(?:{'|'.join(category_ns_names)})"
    while True:
        # Links may be nested, so keep replacing until there is no more change.
        orig = title
        # Category links are removed entirely.
        title = re.sub(
            rf"(?si)\s*\[\[\s*{category_names_pattern}\s*:\s*([^]]+?)\s*\]\]",
            "",
            title,
        )
        # Plain [[target]] / [[target#anchor]] links.
        title = re.sub(
            r"(?s)\[\[\s*:?([^]|#<>:&]+?)\s*(#[^][|<>]*?)?\]\]", repl_1, title
        )
        # [[prefix:target#anchor]] links without link text.
        title = re.sub(
            r"(?s)\[\[\s*(([\w\d]+)\s*:)?\s*([^][#|<>]+?)"
            r"\s*(#[^][|]*?)?\|?\]\]",
            repl_link,
            title,
        )
        # [[target|text|...]] links with one or more bars.
        title = re.sub(
            r"(?s)\[\[\s*([^][|<>]+?)\s*\|"
            r"\s*(([^][|]|\[[^]]*\])+?)"
            r"(\s*\|\s*(([^][|]|\[[^]]*\])+?))*\s*\|*\]\]",
            repl_link_bars,
            title,
        )
        if title == orig:
            break
    # Replace remaining external links ([url label]) by their label, or by
    # the URL itself when there is no label.
    while True:
        orig = title
        title = re.sub(
            r"\[\s*((https?:|mailto:)?//([^][]+?))\s*\]", repl_exturl, title
        )
        if title == orig:
            break

    # Remove italic and bold
    title = remove_italic_and_bold(title)

    # Replace HTML entities
    title = html.unescape(title)
    title = title.replace("\xa0", " ")  # nbsp
    # Remove left-to-right and right-to-left, zero-width characters
    title = re.sub(r"[\u200e\u200f\u200b\u200d\u200c\ufeff]", "", title)
    # Replace whitespace sequences by a single space.
    # https://en.wikipedia.org/wiki/En_(typography)
    title = re.sub(r"[ \t\r\u2002]+", " ", title)
    # Collapse runs of newlines (and spaces just before them) into one.
    title = re.sub(r" *\n+", "\n", title)
    # Eliminate spaces around ellipsis in brackets
    title = re.sub(r"\[\s*…\s*\]", "[…]", title)

    # This unicode quote seems to be used instead of apostrophe quite randomly
    # (about 4% of apostrophes in English entries, some in Finnish entries).
    # title = re.sub("\u2019", "'", title)  # Note: no r"..." here!
    # Replace strange unicode quotes with normal quotes
    # title = re.sub(r"”", '"', title)
    # Replace unicode long dash by normal dash
    # title = re.sub(r"–", "-", title)

    # Remove whitespace before periods and commas etc
    # XXX we might re-enable this, now trying without as it is removing some
    # instances where we would want to leave the space
    # title = re.sub(r" ([.,;:!?)])", repl_1, title)
    # Strip surrounding whitespace.
    if not no_strip:
        title = title.strip()
    # Normalize different ways of writing accents into the NFC canonical form
    title = unicodedata.normalize("NFC", title)
    return title

1596

1597

def clean_template_args(
    wxr: WiktextractContext,
    ht: Union[TemplateArgs, TemplateParameters],
    no_strip=False,
) -> dict[Union[str, int], str]:
    """Return a new dictionary with every key and value of ``ht`` cleaned.

    Keys and values are converted with ``str()`` and passed through
    ``clean_value()`` with ``no_html_strip=True``.  ``no_strip`` is
    forwarded for the values only; keys always have surrounding whitespace
    stripped.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(ht, dict)
    cleaned: dict[Union[str, int], str] = {}
    for raw_key, raw_value in ht.items():
        # Clean the key before the value, as a dict display would.
        key = clean_value(wxr, str(raw_key), no_html_strip=True)
        cleaned[key] = clean_value(
            wxr, str(raw_value), no_strip=no_strip, no_html_strip=True
        )
    return cleaned