# Coverage for src/wiktextract/clean.py: 90% (320 statements)
# Report generated by coverage.py v7.9.2, created at 2025-07-11 10:26 +0000
# This file contains code to clean Wiktionary annotations from a string and to
# produce plain text from it, typically for glossary entries, but it is also
# called for various other data to produce clean strings.
#
# This file also contains code for cleaning qualifiers for the "tags" field.
#
# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
9import html
10import re
11import unicodedata
12from typing import Callable, Optional, Union
14from wikitextprocessor.common import MAGIC_FIRST, MAGIC_LAST, URL_STARTS
15from wikitextprocessor.core import NamespaceDataEntry, TemplateArgs
16from wikitextprocessor.parser import TemplateParameters
18from .wxr_context import WiktextractContext
######################################################################
# Cleaning values into plain text.
######################################################################
# Maps a single character to its Unicode superscript form.  Used by
# to_superscript(), which converts a string only when every one of its
# characters has an entry here.  Note that not every Latin letter is
# present (e.g. there is no entry for "C", "F", "Q", "S" or "q").
superscript_ht: dict[str, str] = {
    "0": "⁰",
    "1": "¹",
    "2": "²",
    "3": "³",
    "4": "⁴",
    "5": "⁵",
    "6": "⁶",
    "7": "⁷",
    "8": "⁸",
    "9": "⁹",
    "+": "⁺",
    # Several dash-like characters all map to the superscript minus.
    "-": "⁻",
    "−": "⁻",
    "‐": "⁻",
    "–": "⁻",
    "—": "⁻",
    # NOTE(review): this key looks like the CJK ideograph for "one";
    # presumably included because it resembles a dash -- confirm.
    "一": "⁻",
    "=": "⁼",
    "(": "⁽",
    ")": "⁾",
    "A": "ᴬ",
    "B": "ᴮ",
    "D": "ᴰ",
    "E": "ᴱ",
    "G": "ᴳ",
    "H": "ᴴ",
    "I": "ᴵ",
    "J": "ᴶ",
    "K": "ᴷ",
    "L": "ᴸ",
    "M": "ᴹ",
    "N": "ᴺ",
    "O": "ᴼ",
    "P": "ᴾ",
    "R": "ᴿ",
    "T": "ᵀ",
    "U": "ᵁ",
    "V": "ⱽ",
    "W": "ᵂ",
    "a": "ᵃ",
    "b": "ᵇ",
    "c": "ᶜ",
    "d": "ᵈ",
    "e": "ᵉ",
    "f": "ᶠ",
    "g": "ᵍ",
    "h": "ʰ",
    "i": "ⁱ",
    "j": "ʲ",
    "k": "ᵏ",
    "l": "ˡ",
    "m": "ᵐ",
    "n": "ⁿ",
    "o": "ᵒ",
    "p": "ᵖ",
    "r": "ʳ",
    "s": "ˢ",
    "t": "ᵗ",
    "u": "ᵘ",
    "v": "ᵛ",
    "w": "ʷ",
    "x": "ˣ",
    "y": "ʸ",
    "z": "ᶻ",
    "β": "ᵝ",
    "γ": "ᵞ",
    "δ": "ᵟ",
    "θ": "ᶿ",
    "ι": "ᶥ",
    "φ": "ᵠ",
    "χ": "ᵡ",
    "∞": "\u2002᪲",  # This is a KLUDGE
}
# Maps a single character to its Unicode subscript form.  Used by
# to_subscript(), which converts a string only when every one of its
# characters has an entry here.  Only part of the Latin alphabet is covered.
subscript_ht: dict[str, str] = {
    "0": "₀",
    "1": "₁",
    "2": "₂",
    "3": "₃",
    "4": "₄",
    "5": "₅",
    "6": "₆",
    "7": "₇",
    "8": "₈",
    "9": "₉",
    "+": "₊",
    "-": "₋",
    "−": "₋",
    "=": "₌",
    "(": "₍",
    ")": "₎",
    "a": "ₐ",
    "e": "ₑ",
    "h": "ₕ",
    "i": "ᵢ",
    "j": "ⱼ",
    "k": "ₖ",
    "l": "ₗ",
    "m": "ₘ",
    "n": "ₙ",
    "o": "ₒ",
    "p": "ₚ",
    "r": "ᵣ",
    "s": "ₛ",
    "t": "ₜ",
    "u": "ᵤ",
    "v": "ᵥ",
    "x": "ₓ",
    "ə": "ₔ",
    "ρ": "ᵨ",
    "φ": "ᵩ",
    "χ": "ᵪ",
}
def to_superscript(text: str) -> str:
    """Converts text to superscript.

    If every character has an entry in superscript_ht, the Unicode
    superscript characters are returned.  Otherwise the text is marked with
    a caret: "^x" for a single character, "^(...)" for longer text.  Empty
    input yields an empty string."""
    if not text:
        return ""
    try:
        # A missing character raises KeyError and abandons the conversion.
        return "".join(superscript_ht[ch] for ch in text)
    except KeyError:
        pass
    return "^" + text if len(text) == 1 else "^({})".format(text)
def to_subscript(text: str) -> str:
    """Converts text to subscript.

    If every character has an entry in subscript_ht, the Unicode subscript
    characters are returned.  Otherwise the text is marked with an
    underscore: "_x" for a single character, "_(...)" for longer text.
    Empty input yields an empty string."""
    if not text:
        return ""
    try:
        # A missing character raises KeyError and abandons the conversion.
        return "".join(subscript_ht[ch] for ch in text)
    except KeyError:
        pass
    return "_" + text if len(text) == 1 else "_({})".format(text)
def to_chem(text: str) -> str:
    """Converts text to chemical formula, making digits subscript.

    Each character for which str.isdigit() is true is passed on its own
    through to_subscript(); every other character is copied unchanged."""
    pieces = []
    for ch in text:
        pieces.append(to_subscript(ch) if ch.isdigit() else ch)
    return "".join(pieces)
# Mapping from Latex names to Unicode characters/strings.  This is the
# default mapping (some cases are handled specially in the code).
# Keys are command names without the leading backslash; to_math() looks a
# command up here after stripping the backslash and surrounding whitespace.
# An empty-string value makes the command disappear from the output.
math_map: dict[str, str] = {
    # XXX should probably change greek characters to non-slanted ones?
    "AC": "∿",
    "APLcomment": "⍝",
    "APLdownarrowbox": "⍗",
    "APLinput": "⍞",
    "APLinv": "⌹",
    "APLleftarrowbox": "⍇",
    "APLlog": "⍟",
    "APLrightarrowbox": "⍈",
    "APLuparrowbox": "⍐",
    "Angstroem": "Å",
    "Bot": "⫫",
    "Box": "□",
    "Bumpeq": "≎",
    "CIRCLE": "●",
    "Cap": "⋒",
    "CapitalDifferentialD": "ⅅ",
    "CheckedBox": "☑",
    "Circle": "○",
    "Coloneqq": "⩴",
    "ComplexI": "ⅈ",
    "ComplexJ": "ⅉ",
    "Cup": "⋓",
    "Delta": "Δ",
    "Diamond": "◇",
    "Diamondblack": "◆",
    "Diamonddot": "⟐",
    "DifferentialD": "ⅆ",
    "Digamma": "Ϝ",
    "Doteq": "≑",
    "DownArrowBar": "⤓",
    "DownLeftTeeVector": "⥞",
    "DownLeftVectorBar": "⥖",
    "DownRightTeeVector": "⥟",
    "DownRightVectorBar": "⥗",
    "Downarrow": "⇓",
    "Equal": "⩵",
    "Euler": "Ɛ",
    "ExponentialE": "ⅇ",
    "ExponetialE": "ⅇ",
    "Finv": "Ⅎ",
    "Gamma": "Γ",
    "Im": "ℑ",
    "Join": "⨝",
    "Koppa": "Ϟ",
    "LEFTCIRCLE": "◖",
    "LEFTcircle": "◐",
    "LHD": "◀",
    "LVec": "x⃖",
    "Lambda": "Λ",
    "Lbag": "⟅",
    "LeftArrowBar": "⇤",
    "LeftDownTeeVector": "⥡",
    "LeftDownVectorBar": "⥙",
    "LeftTeeVector": "⥚",
    "LeftTriangleBar": "⧏",
    "LeftUpTeeVector": "⥠",
    "LeftUpVectorBar": "⥘",
    "LeftVectorBar": "⥒",
    "Leftarrow": "⇐",
    "Leftrightarrow": "⇔",
    "Lleftarrow": "⇚",
    "Longleftarrow": "⟸",
    "Longleftrightarrow": "⟺",
    "Longmapsfrom": "⟽",
    "Longmapsto": "⟾",
    "Longrightarrow": "⟹",
    "Lparen": "⦅",
    "Lsh": "↰",
    "MapsDown": "↧",
    "MapsUp": "↥",
    "Mapsfrom": "⤆",
    "Mapsto": "⤇",
    "Micro": "µ",
    "Nearrow": "⇗",
    "NestedGreaterGreater": "⪢",
    "NestedLessLess": "⪡",
    "NotGreaterLess": "≹",
    "NotGreaterTilde": "≵",
    "NotLessTilde": "≴",
    "Nwarrow": "⇖",
    "Omega": "Ω",
    "Phi": "Φ",
    "Pi": "Π",
    "Proportion": "∷",
    "Psi": "Ψ",
    "Qoppa": "Ϙ",
    "RHD": "▶",
    "RIGHTCIRCLE": "◗",
    "RIGHTcircle": "◑",
    "Rbag": "⟆",
    "Re": "ℜ",
    "RightArrowBar": "⇥",
    "RightDownTeeVector": "⥝",
    "RightDownVectorBar": "⥕",
    "RightTeeVector": "⥛",
    "RightTriangleBar": "⧐",
    "RightUpTeeVector": "⥜",
    "RightUpVectorBar": "⥔",
    "RightVectorBar": "⥓",
    "Rightarrow": "⇒",
    "Rparen": "⦆",
    "Rrightarrow": "⇛",
    "Rsh": "↱",
    "S": "§",
    "Same": "⩶",
    "Sampi": "Ϡ",
    "Searrow": "⇘",
    "Sigma": "Σ",
    "Square": "☐",
    "Stigma": "Ϛ",
    "Subset": "⋐",
    "Sun": "☉",
    "Supset": "⋑",
    "Swarrow": "⇙",
    "Theta": "Θ",
    "Top": "⫪",
    "UpArrowBar": "⤒",
    "Uparrow": "⇑",
    "Updownarrow": "⇕",
    "Upsilon": "Υ",
    "VDash": "⊫",
    "VERT": "⦀",
    "Vdash": "⊩",
    "Vert": "‖",
    "Vvdash": "⊪",
    "XBox": "☒",
    "Xi": "Ξ",
    "Yup": "⅄",
    "_": "_",
    "aleph": "א",
    "alpha": "α",
    "amalg": "⨿",
    "anchor": "⚓",
    "angle": "∠",
    "approx": "≈",
    "approxeq": "≊",
    "aquarius": "♒",
    "arg": "arg",
    "aries": "♈",
    "arrowbullet": "➢",
    "ast": "∗",
    "asymp": "≍",
    "backepsilon": "϶",
    "backprime": "‵",
    "backsim": "∽",
    "backsimeq": "⋍",
    "backslash": "",
    "ballotx": "✗",
    "barin": "⋶",
    "barleftharpoon": "⥫",
    "barrightharpoon": "⥭",
    "barwedge": "⊼",
    "because": "∵",
    "beta": "β",
    "beth": "ב",
    "between": "≬",
    "bigcap": "∩",
    "bigcup": "∪",
    "biginterleave": "⫼",
    "bigodot": "⨀",
    "bigoplus": "⨁",
    "bigotimes": "⨂",
    "bigsqcap": "⨅",
    "bigsqcup": "⨆",
    "bigstar": "★",
    "bigtriangledown": "▽",
    "bigtriangleup": "△",
    "biguplus": "⨄",
    "bigvee": "∨",
    "bigwedge": "∧",
    "bij": "⤖",
    "biohazard": "☣",
    "blacklozenge": "⧫",
    "blacksmiley": "☻",
    "blacksquare": "■",
    "blacktriangledown": "▾",
    "blacktriangleleft": "◂",
    "blacktriangleright": "▸",
    "blacktriangleup": "▴",
    "bot": "⊥",
    "bowtie": "⋈",
    "boxast": "⧆",
    "boxbar": "◫",
    "boxbox": "⧈",
    "boxbslash": "⧅",
    "boxcircle": "⧇",
    "boxdot": "⊡",
    "boxminus": "⊟",
    "boxplus": "⊞",
    "boxslash": "⧄",
    "boxtimes": "⊠",
    "bullet": "•",
    "bumpeq": "≏",
    "cancer": "♋",
    "cap": "∩",
    "capricornus": "♑",
    "capwedge": "⩄",
    "cat": "⁀",
    "cdot": "·",
    "cdots": "⋯",
    "cent": "¢",
    "checkmark": "✓",
    "chi": "χ",
    "circ": "∘",
    "circeq": "≗",
    "circlearrowleft": "↺",
    "circlearrowright": "↻",
    "circledR": "®",
    "circledast": "⊛",
    "circledbslash": "⦸",
    "circledcirc": "⊚",
    "circleddash": "⊝",
    "circledgtr": "⧁",
    "circledless": "⧀",
    "clubsuit": "♣",
    "colon": ":",
    "coloneq": "≔",
    "complement": "∁",
    "cong": "≅",
    "coprod": "∐",
    "corresponds": "≙",
    "cup": "∪",
    "curlyeqprec": "⋞",
    "curlyeqsucc": "⋟",
    "curlyvee": "⋎",
    "curlywedge": "⋏",
    "curvearrowleft": "↶",
    "curvearrowright": "↷",
    "dagger": "†",
    "daleth": "ד",
    "dashleftarrow": "⇠",
    "dashrightarrow": "⇢",
    "dashv": "⊣",
    "ddagger": "‡",
    "delta": "δ",
    "diameter": "∅",
    "diamond": "⋄",
    "diamondsuit": "♢",
    "digamma": "ϝ",
    "div": "÷",
    "divideontimes": "⋇",
    "dlsh": "↲",
    "dot\\bigvee": "⩒",
    "dot\\cap": "⩀",
    "dot\\cup": "⊍",
    "dot\\lor": "⩒",
    "dot\\vee": "⩒",
    "doteq": "≐",
    "dotplus": "∔",
    "dots": "…",
    "doublebarwedge": "⩞",
    "downarrow": "↓",
    "downdownarrows": "⇊",
    "downdownharpoons": "⥥",
    "downharpoonleft": "⇃",
    "downharpoonright": "⇂",
    "downuparrows": "⇵",
    "downupharpoons": "⥯",
    "drsh": "↳",
    "dsub": "⩤",
    "earth": "♁",
    "eighthnote": "♪",
    "ell": "ℓ",
    "emptyset": "∅",
    "epsilon": "ϵ",
    "eqcirc": "≖",
    "eqcolon": "∹",
    "eqsim": "≂",
    "eqslantgtr": "⪖",
    "eqslantless": "⪕",
    "equiv": "≡",
    "eta": "η",
    "eth": "ð",
    "exists": "∃",
    "fallingdotseq": "≒",
    "fcmp": "⨾",
    "female": "♀",
    "ffun": "⇻",
    "finj": "⤕",
    "fint": "⨏",
    "flat": "♭",
    "footnotesize": "",
    "forall": "∀",
    "fourth": "⁗",
    "frown": "⌢",
    "frownie": "☹",
    "gamma": "γ",
    # \ge is LaTeX's short synonym for \geq ("greater than or equal"), so it
    # must render the same as "geq" below, not as a strict ">".
    "ge": "≥",
    "gemini": "♊",
    "geq": "≥",
    "geqq": "≧",
    "geqslant": "⩾",
    "gg": "≫",
    "ggcurly": "⪼",
    "ggg": "⋙",
    "gimel": "ג",
    "gnapprox": "⪊",
    "gneq": "⪈",
    "gneqq": "≩",
    "gnsim": "⋧",
    "gtrapprox": "⪆",
    "gtrdot": "⋗",
    "gtreqless": "⋛",
    "gtreqqless": "⪌",
    "gtrless": "≷",
    "gtrsim": "≳",
    "hash": "⋕",
    "heartsuit": "♡",
    "hookleftarrow": "↩",
    "hookrightarrow": "↪",
    "hslash": "ℏ",
    "iddots": "⋰",
    "iff": "⟺",
    "iiiint": "⨌",
    "iiint": "∭",
    "iint": "∬",
    "imath": "ı",
    "implies": "⟹",
    "in": "∈",
    "infty": "∞",
    "int": "∫",
    "intercal": "⊺",
    "interleave": "⫴",
    "invamp": "⅋",
    "invdiameter": "⍉",
    "invneg": "⌐",
    "iota": "ι",
    "jmath": "ȷ",
    "jupiter": "♃",
    "kappa": "κ",
    "koppa": "ϟ",
    "lambda": "λ",
    "land": "∧",
    "lang": "⟪",
    "langle": "⟨",
    "large": "",
    "lblot": "⦉",
    "lbrace": "{",
    "lbrack": "[",
    "lceil": "⌈",
    "ldots": "…",
    # \le is LaTeX's short synonym for \leq ("less than or equal"), so it
    # must render the same as "leq" below, not as a strict "<".
    "le": "≤",
    "leadsto": "⤳",
    "leftarrow": "←",
    "leftarrowtail": "↢",
    "leftarrowtriangle": "⇽",
    "leftbarharpoon": "⥪",
    "leftharpoondown": "↽",
    "leftharpoonup": "↼",
    "leftleftarrows": "⇇",
    "leftleftharpoons": "⥢",
    "leftmoon": "☾",
    "leftrightarrow": "↔",
    "leftrightarrows": "⇆",
    "leftrightarrowtriangle": "⇿",
    "leftrightharpoon": "⥊",
    "leftrightharpoondown": "⥐",
    "leftrightharpoons": "⇋",
    "leftrightharpoonup": "⥎",
    "leftrightsquigarrow": "↭",
    "leftslice": "⪦",
    "leftsquigarrow": "⇜",
    "leftthreetimes": "⋋",
    "leftupdownharpoon": "⥑",
    "leo": "♌",
    "leq": "≤",
    "leqq": "≦",
    "leqslant": "⩽",
    "lessapprox": "⪅",
    "lessdot": "⋖",
    "lesseqgtr": "⋚",
    "lesseqqgtr": "⪋",
    "lessgtr": "≶",
    "lessim": "≲",
    "lesssim": "≲",
    "lfloor": "⌊",
    "lgroup": "⟮",
    "lhd": "◁",
    "libra": "♎",
    "lightning": "↯",
    "limg": "⦇",
    "ll": "≪",
    "llbracket": "⟦",
    "llcorner": "⌞",
    "llcurly": "⪻",
    "lll": "⋘",
    "lnapprox": "⪉",
    "lneq": "⪇",
    "lneqq": "≨",
    "lnot": "¬",
    "lnsim": "⋦",
    "longleftarrow": "⟵",
    "longleftrightarrow": "⟷",
    "longmapsfrom": "⟻",
    "longmapsto": "⟼",
    "longrightarrow": "⟶",
    "looparrowleft": "↫",
    "looparrowright": "↬",
    "lor": "∨",
    "lozenge": "◊",
    "lrcorner": "⌟",
    "ltimes": "⋉",
    "male": "♂",
    "maltese": "✠",
    "mapsfrom": "↤",
    "mapsto": "↦",
    "measuredangle": "∡",
    "medbullet": "⚫",
    "medcirc": "⚪",
    "mercury": "☿",
    "mho": "℧",
    "mid": "∣",
    "mlcp": "⫛",
    "mod": " mod ",
    "models": "⊧",
    "mp": "∓",
    "mu": "μ",
    "multimap": "⊸",
    "multimapboth": "⧟",
    "multimapdotbothA": "⊶",
    "multimapdotbothB": "⊷",
    "multimapinv": "⟜",
    "nLeftarrow": "⇍",
    "nLeftrightarrow": "⇎",
    "nRightarrow": "⇏",
    "nVDash": "⊯",
    "nVdash": "⊮",
    "nabla": "∇",
    "napprox": "≉",
    "natural": "♮",
    "ncong": "≇",
    "nearrow": "↗",
    "neg": "¬",
    "neptune": "♆",
    "neq": "≠",
    "nequiv": "≢",
    "nexists": "∄",
    "ngeq": "≱",
    "ngtr": "≯",
    "ni": "∋",
    "nleftarrow": "↚",
    "nleftrightarrow": "↮",
    "nleq": "≰",
    "nless": "≮",
    "nmid": "∤",
    "nni": "∌",
    "normalsize": "",
    "not\\in": "∉",
    "not\\ni": "∌",
    "not\\preceq": "⋠",
    "not\\subset": "⊄",
    "not\\subseteq": "⊈",
    "not\\succeq": "⋡",
    "not\\supset": "⊅",
    "not\\supseteq": "⊉",
    "not\\trianglelefteq": "⋬",
    "not\\trianglerighteq": "⋭",
    "not\\vartriangleleft": "⋪",
    "not\\vartriangleright": "⋫",
    "notasymp": "≭",
    "notbackslash": "⍀",
    "notin": "∉",
    "notslash": "⌿",
    "nparallel": "∦",
    "nprec": "⊀",
    "npreceq": "⋠",
    "nrightarrow": "↛",
    "nsim": "≁",
    "nsimeq": "≄",
    "nsqsubseteq": "⋢",
    "nsqsupseteq": "⋣",
    "nsubset": "⊄",
    "nsubseteq": "⊈",
    "nsucc": "⊁",
    "nsucceq": "⋡",
    "nsupset": "⊅",
    "nsupseteq": "⊉",
    "ntriangleleft": "⋪",
    "ntrianglelefteq": "⋬",
    "ntriangleright": "⋫",
    "ntrianglerighteq": "⋭",
    "nu": "ν",
    "nvDash": "⊭",
    "nvdash": "⊬",
    "nwarrow": "↖",
    "odot": "⊙",
    "oiiint": "∰",
    "oiint": "∯",
    "oint": "∮",
    "ointctrclockwise": "∳",
    "omega": "ω",
    "ominus": "⊖",
    "oplus": "⊕",
    "oslash": "⊘",
    "otimes": "⊗",
    "over": "/",
    "overbrace": "⏞",
    "overleftrightarrow": "x⃡",
    "overparen": "⏜",
    "overset?=": "≟",
    "overset{?}{=}": "≟",
    "overset{\\operatorname{def}}{=}": "≝",
    "parallel": "∥",
    "partial": "∂",
    "pencil": "✎",
    "perp": "⊥",
    "pfun": "⇸",
    "phi": "ϕ",
    "pi": "π",
    "pinj": "⤔",
    "pisces": "♓",
    "pitchfork": "⋔",
    "pluto": "♇",
    "pm": "±",
    "pointright": "☞",
    "pounds": "£",
    "prec": "≺",
    "precapprox": "⪷",
    "preccurlyeq": "≼",
    "preceq": "⪯",
    "preceqq": "⪳",
    "precnapprox": "⪹",
    "precnsim": "⋨",
    "precsim": "≾",
    "prime": "′",
    "prod": "∏",
    "propto": "∝",
    "psi": "ψ",
    "psur": "⤀",
    "qoppa": "ϙ",
    "quad": " ",
    "quarternote": "♩",
    "radiation": "☢",
    "rang": "⟫",
    "rangle": "⟩",
    "rarr": "→",
    "rblot": "⦊",
    "rbrace": "}",
    "rbrack": "]",
    "rceil": "⌉",
    "recycle": "♻",
    "rfloor": "⌋",
    "rgroup": "⟯",
    "rhd": "▷",
    "rho": "ρ",
    "rightangle": "∟",
    "rightarrow": "→",
    "rightarrowtail": "↣",
    "rightarrowtriangle": "⇾",
    "rightbarharpoon": "⥬",
    "rightharpoondown": "⇁",
    "rightharpoonup": "⇀",
    "rightleftarrows": "⇄",
    "rightleftharpoon": "⥋",
    "rightleftharpoons": "⇌",
    "rightmoon": "☽",
    "rightrightarrows": "⇉",
    "rightrightharpoons": "⥤",
    "rightslice": "⪧",
    "rightsquigarrow": "⇝",
    "rightthreetimes": "⋌",
    "rightupdownharpoon": "⥏",
    "rimg": "⦈",
    "risingdotseq": "≓",
    "rrbracket": "⟧",
    "rsub": "⩥",
    "rtimes": "⋊",
    "sagittarius": "♐",
    "sampi": "ϡ",
    "saturn": "♄",
    "scorpio": "♏",
    "scriptsize": "",
    "searrow": "↘",
    "second": "″",
    "setminus": "⧵",
    "sharp": "♯",
    "sigma": "σ",
    "sim": "∼",
    "simeq": "≃",
    "sixteenthnote": "♬",
    "skull": "☠",
    "slash": "∕",
    "small": "",
    "smallsetminus": "∖",
    "smalltriangledown": "▿",
    "smalltriangleleft": "◃",
    "smalltriangleright": "▹",
    "smalltriangleup": "▵",
    "smile": "⌣",
    "smiley": "☺",
    "spadesuit": "♠",
    "spddot": "¨",
    "sphat": "^",
    "sphericalangle": "∢",
    "spot": "⦁",
    "sptilde": "~",
    "sqcap": "⊓",
    "sqcup": "⊔",
    "sqint": "⨖",
    "sqrt": "√",  # ∛ ∜ - partly special handling below
    "sqrt[3]": "∛",
    "sqrt[4]": "∜",
    "sqsubset": "⊏",
    "sqsubseteq": "⊑",
    "sqsupset": "⊐",
    "sqsupseteq": "⊒",
    "square": "□",
    "sslash": "⫽",
    "star": "⋆",
    "steaming": "☕",
    "stigma": "ϛ",
    "strictfi": "⥼",
    "strictif": "⥽",
    "subset": "⊂",
    "subseteq": "⊆",
    "subseteqq": "⫅",
    "subsetneq": "⊊",
    "subsetneqq": "⫋",
    "succ": "≻",
    "succapprox": "⪸",
    "succcurlyeq": "≽",
    "succeq": "⪰",
    "succeqq": "⪴",
    "succnapprox": "⪺",
    "succnsim": "⋩",
    "succsim": "≿",
    "sum": "∑",
    "sun": "☼",
    "supset": "⊃",
    "supseteq": "⊇",
    "supseteqq": "⫆",
    "supsetneq": "⊋",
    "supsetneqq": "⫌",
    "swarrow": "↙",
    "swords": "⚔",
    "talloblong": "⫾",
    "tau": "τ",
    "taurus": "♉",
    "tcohm": "Ω",
    "textbackslash": "\\",
    "textbar": "|",
    "textbullet": "•",
    "textgreater": ">",
    "textless": "<",
    "textprime": "′",
    "therefore": "∴",
    "theta": "θ",
    "third": "‴",
    "times": "×",
    "tiny": "",
    "to": "→",
    "top": "⊤",
    "triangle": "∆",
    "trianglelefteq": "⊴",
    "triangleq": "≜",
    "trianglerighteq": "⊵",
    "twoheadleftarrow": "↞",
    "twoheadrightarrow": "↠",
    "twonotes": "♫",
    "ulcorner": "⌜",
    "underbar": " ̱",
    "underbrace": "⏟",
    "underleftarrow": "x⃮",
    "underline": " ̲",
    "underparen": "⏝",
    "underrightarrow": "x⃯",
    "uparrow": "↑",
    "updownarrow": "↕",
    "updownarrows": "⇅",
    "updownharpoons": "⥮",
    "upharpoonleft": "↿",
    "upharpoonright": "↾",
    "uplus": "⊎",
    "upsilon": "υ",
    "upuparrows": "⇈",
    "upupharpoons": "⥣",
    "uranus": "♅",
    "urcorner": "⌝",
    "utilde": " ̰",
    "vDash": "⊨",
    "varbeta": "β",
    "varclubsuit": "♧",
    "vardiamondsuit": "♦",
    "varepsilon": "ε",
    "varheartsuit": "♥",
    "varkappa": "ϰ",
    "varnothing": "∅",
    "varointclockwise": "∲",
    "varphi": "φ",
    "varpi": "ϖ",
    "varprod": "⨉",
    "varrho": "ϱ",
    "varsigma": "ς",
    "varspadesuit": "♤",
    "vartheta": "θ",
    "vartriangleleft": "⊲",
    "vartriangleright": "⊳",
    "vdash": "⊢",
    "vdots": "⋮",
    "vee": "∨",
    "veebar": "⊻",
    "vert": "|",
    "virgo": "♍",
    "warning": "⚠",
    "wasylozenge": "⌑",
    "wedge": "∧",
    "widehat=": "≙",
    "widehat{=}": "≙",
    "wp": "℘",
    "wr": "≀",
    "xi": "ξ",
    "yen": "¥",
    "yinyang": "☯",
    "zcmp": "⨟",
    "zeta": "ζ",
    "zhide": "⧹",
    "zpipe": "⨠",
    "zproject": "⨡",
    "|": "‖",
    # Accents XXX these really should be handled specially with diacritics
    # after argument
    "acute": "́",
    "bar": "̄",
    "breve": "̆",
    "check": "̌",
    "ddddot": "⃜",
    "dddot": "⃛",
    "ddot": "̈",
    "ddots": "⋱",
    "dot": "̇",
    "grave": "̀",
    "hat": "̂",
    "lvec": "⃐",
    "mathring": "̊",
    "not": "̸",
    "overline": "◌̅",
    "tilde": "̃",
    "vec": "⃑",
    # Some ignored operators
    "bigl": "",
    "bigr": "",
    "left": "",
    "right": "",
    "style": "",
    "textstyle": "",
    "mathrm": "",
}
# Maps ASCII letters to their script (calligraphic) Unicode counterparts.
# Used by mathcal_fn(), which translates text one character at a time and
# leaves characters without an entry unchanged.
mathcal_map: dict[str, str] = {
    "A": "𝒜",
    "B": "ℬ",
    "C": "𝒞",
    "D": "𝒟",
    "E": "ℰ",
    "F": "ℱ",
    "G": "𝒢",
    "H": "ℋ",
    "I": "ℐ",
    "J": "𝒥",
    "K": "𝒦",
    "L": "ℒ",
    "M": "ℳ",
    "N": "𝒩",
    "O": "𝒪",
    "P": "𝒫",
    "Q": "𝒬",
    "R": "ℛ",
    "S": "𝒮",
    "T": "𝒯",
    "U": "𝒰",
    "V": "𝒱",
    "W": "𝒲",
    "X": "𝒳",
    "Y": "𝒴",
    "Z": "𝒵",
    "a": "𝒶",
    "b": "𝒷",
    "c": "𝒸",
    "d": "𝒹",
    "e": "ℯ",
    "f": "𝒻",
    "g": "ℊ",
    "h": "𝒽",
    "i": "𝒾",
    "j": "𝒿",
    "k": "𝓀",
    "l": "𝓁",
    "m": "𝓂",
    "n": "𝓃",
    "o": "ℴ",
    "p": "𝓅",
    "q": "𝓆",
    "r": "𝓇",
    "s": "𝓈",
    "t": "𝓉",
    "u": "𝓊",
    "v": "𝓋",
    "w": "𝓌",
    "x": "𝓍",
    "y": "𝓎",
    "z": "𝓏",
}
# Maps ASCII capital letters to their Fraktur Unicode counterparts.  Used by
# mathfrak_fn(), which translates text one character at a time and leaves
# characters without an entry (e.g. lowercase letters) unchanged.
# Several Fraktur capitals (C, H, I, R, Z) live in the Letterlike Symbols
# block rather than in the Mathematical Alphanumeric Symbols block.
mathfrak_map: dict[str, str] = {
    "A": "𝔄",
    "B": "𝔅",
    "C": "ℭ",
    "D": "𝔇",
    "E": "𝔈",
    "F": "𝔉",
    "G": "𝔊",
    "H": "ℌ",
    "I": "ℑ",  # previously missing, so \mathfrak{I} stayed a plain "I"
    "J": "𝔍",
    "K": "𝔎",
    "L": "𝔏",
    "M": "𝔐",
    "N": "𝔑",
    "O": "𝔒",
    "P": "𝔓",
    "Q": "𝔔",
    "R": "ℜ",  # previously missing, so \mathfrak{R} stayed a plain "R"
    "S": "𝔖",
    "T": "𝔗",
    "U": "𝔘",
    "V": "𝔙",
    "W": "𝔚",
    "X": "𝔛",
    "Y": "𝔜",
    "Z": "ℨ",
}
# Maps letters, digits and a few Greek-letter names to their double-struck
# (blackboard bold) Unicode counterparts.  Used by mathbb_fn().
# NOTE(review): mathbb_fn() looks text up one character at a time, so the
# multi-character keys below ("pi", "gamma", "Gamma", "Pi", "Sigma") cannot
# be matched through it; confirm whether another caller uses them.
mathbb_map: dict[str, str] = {
    "A": "𝔸",
    "B": "𝔹",
    "C": "ℂ",
    "D": "𝔻",
    "E": "𝔼",
    "F": "𝔽",
    "G": "𝔾",
    "H": "ℍ",
    "I": "𝕀",
    "J": "𝕁",
    "K": "𝕂",
    "L": "𝕃",
    "M": "𝕄",
    "N": "ℕ",
    "O": "𝕆",
    "P": "ℙ",
    "Q": "ℚ",
    "R": "ℝ",
    "S": "𝕊",
    "T": "𝕋",
    "U": "𝕌",
    "V": "𝕍",
    "W": "𝕎",
    "X": "𝕏",
    "Y": "𝕐",
    "Z": "ℤ",
    "a": "𝕒",
    "b": "𝕓",
    "c": "𝕔",
    "d": "𝕕",
    "e": "𝕖",
    "f": "𝕗",
    "g": "𝕘",
    "h": "𝕙",
    "i": "𝕚",
    "j": "𝕛",
    "k": "𝕜",
    "l": "𝕝",
    "m": "𝕞",
    "n": "𝕟",
    "o": "𝕠",
    "p": "𝕡",
    "q": "𝕢",
    "r": "𝕣",
    "s": "𝕤",
    "t": "𝕥",
    "u": "𝕦",
    "v": "𝕧",
    "w": "𝕨",
    "x": "𝕩",
    "y": "𝕪",
    "z": "𝕫",
    "pi": "ℼ",
    "gamma": "ℽ",
    "Gamma": "ℾ",
    "Pi": "ℿ",
    "Sigma": "⅀",
    "0": "𝟘",
    "1": "𝟙",
    "2": "𝟚",
    "3": "𝟛",
    "4": "𝟜",
    "5": "𝟝",
    "6": "𝟞",
    "7": "𝟟",
    "8": "𝟠",
    "9": "𝟡",
}
def mathcal_fn(text: str) -> str:
    """Returns text with each character replaced by its mathcal_map entry;
    characters without an entry are kept as they are."""
    converted = []
    for ch in text:
        converted.append(mathcal_map.get(ch, ch))
    return "".join(converted)
def mathfrak_fn(text: str) -> str:
    """Returns text with each character replaced by its mathfrak_map entry;
    characters without an entry are kept as they are."""
    converted = []
    for ch in text:
        converted.append(mathfrak_map.get(ch, ch))
    return "".join(converted)
def mathbb_fn(text: str) -> str:
    """Returns text with each character replaced by its mathbb_map entry;
    characters without an entry are kept as they are."""
    converted = []
    for ch in text:
        converted.append(mathbb_map.get(ch, ch))
    return "".join(converted)
def to_math(text: str) -> str:
    """Converts a LaTeX-style mathematical formula to plain Unicode text.

    Brace groups are rendered innermost-first and temporarily replaced by
    single placeholder characters (code points from MAGIC_FIRST upwards) so
    that the tokenizer can treat a whole group as one atom.  Commands are
    translated through math_map; the mathcal/mathfrak/mathbb fonts,
    fractions, roots, subscripts and superscripts are handled specially.
    Commands that are not in math_map and are purely alphanumeric are kept
    as their bare name surrounded by spaces."""
    # print("to_math: {!r}".format(text))
    # magic_vec[k] holds the rendered text of the group that was replaced by
    # the placeholder chr(MAGIC_FIRST + k).
    magic_vec: list[str] = []
    # formatting with {:c} converts input into character
    magic_re = re.compile(r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST))

    def expand(text: str) -> str:
        """Substitutes placeholders by their text until none are left
        (an expansion may itself contain placeholders)."""
        while True:
            orig = text
            text = magic_re.sub(
                lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST],
                text,
            )
            if text == orig:
                break
        return text

    def recurse(text: str) -> str:
        def math_magic(
            text: str, left: str, right: str, fn: Callable[[str], str]
        ) -> str:
            """Replaces every innermost left...right group by a fresh
            placeholder whose expansion is fn(group content), repeating
            until no group is left."""
            regexp_str = r"{}([^{}{}]+){}".format(
                re.escape(left),
                re.escape(left),
                re.escape(right),
                re.escape(right),
            )
            regexp = re.compile(regexp_str)

            def repl(m: re.Match) -> str:
                # The placeholder must be computed before fn() runs, as in
                # the statement order below, only if fn appended nothing;
                # keep this order: nested groups were already replaced.
                magic = chr(MAGIC_FIRST + len(magic_vec))
                t = fn(m.group(1)).strip()
                magic_vec.append(t)
                return magic

            while True:
                orig = text
                text = re.sub(regexp, repl, text)
                if text == orig:
                    break
            return text

        def radicand(rest: str) -> str:
            """Renders the operand that the tokenizer fused to a sqrt
            token; operands longer than one character are parenthesized,
            like the operands of a fraction."""
            rest = rest.strip()
            if not rest:
                return ""
            rest = expand_group(rest).strip()
            if len(rest) > 1:
                rest = "(" + rest + ")"
            return rest

        def expand_group(v: str) -> str:
            """Renders one token produced by the tokenizer below."""
            # fn, when set, is applied to the fully expanded token text.
            fn: Optional[Callable[[str], str]] = None
            if re.match(r"\\mathcal\b", v):
                fn = mathcal_fn
                v = v[8:].strip()
            elif re.match(r"\\mathfrak\b", v):
                fn = mathfrak_fn
                v = v[9:].strip()
            elif re.match(r"\\mathbb\b", v):
                fn = mathbb_fn
                v = v[7:]
            elif re.match(r"\\(begin|end)\b", v):
                v = ""  # Skip
            elif re.match(r"\\text\b", v):
                v = v[5:]
            elif re.match(r"\\pmod\b", v):
                v = v[5:].strip()
                v = "(mod " + expand_group(v) + ")"
            elif re.match(r"\\sqrt\[", v):
                # The token is "\sqrt[index]" followed by the operand atom.
                # Separate the two instead of slicing up to the last
                # character, which used to drag "]" and the operand into
                # the index and then drop the operand altogether.
                sm = re.match(r"(?s)\\sqrt\[([^\]]*)\](.*)", v)
                if sm:
                    a, rest = sm.group(1).strip(), sm.group(2)
                else:
                    # No closing bracket, e.g. the token is just "\sqrt["
                    a, rest = v[6:].strip(), ""
                if a == "2":
                    v = "√"
                elif a == "3":
                    v = "∛"
                elif a == "4":
                    v = "∜"
                else:
                    v = to_superscript(a) + "√"
                v += radicand(rest)
            elif re.match(r"\\sqrt($|[0-9]|\b)", v):
                # Keep whatever follows "\sqrt" in the token (typically the
                # placeholder of a brace group) instead of discarding it.
                v = "√" + radicand(v[5:])
            elif re.match(r"\\(frac|binom)($|[0-9]|\b)", v):
                m = re.match(
                    r"\\(frac|binom)\s*(\\[a-zA-Z]+|\\.|.)\s*"
                    r"(\\[a-zA-Z]+|\\.|.)$",
                    v,
                )
                if not m:
                    print("MATH FRAC/BINOM ERROR: {!r}".format(v))
                    return v
                op, a, b = m.groups()
                a = expand_group(a).strip()
                b = expand_group(b).strip()
                if len(a) > 1:
                    a = "(" + a + ")"
                if len(b) > 1:
                    b = "(" + b + ")"
                if op == "frac":
                    v = a + "/" + b
                elif op == "binom":
                    v = "binom({}, {})".format(a, b)
                else:
                    # Should never get here
                    v = "{}({})".format(op, v)
            elif v.startswith("_"):
                fn = to_subscript
                v = v[1:]
            elif v.startswith("^"):
                fn = to_superscript
                v = v[1:]
            # Whatever is left may still be a command (e.g. the argument of
            # a font command or of a sub/superscript).
            if v.startswith("\\"):
                mapped = math_map.get(v[1:].strip())
                if mapped is None:
                    if v[1:].strip().isalnum():
                        v = " " + v[1:].strip() + " "
                    else:
                        v = v[1:].strip()
                else:
                    v = mapped
            elif v.isspace() or v in ("&",):  # Ignore certain special chars
                v = ""
            if fn is not None:
                v = expand(v)
                v = fn(v)
            v = expand(v)
            return v

        parts: list[str] = []
        # Render all brace groups first (innermost first, via recursion).
        while True:
            orig = text
            text = math_magic(text, "{", "}", recurse)
            if text == orig:
                break
        # Tokenizer: whitespace | \frac with two atoms | optional prefix
        # command + optional "_"/"^" + one atom.
        # NOTE(review): in r"\\overset\{?\}\{=\}" the "?" is unescaped, so it
        # makes the "{" optional rather than matching a literal "?"; the
        # brace-containing alternatives also look unreachable once groups
        # have been replaced by placeholders above -- confirm.
        for m in re.finditer(
            r"\s+|"
            r"\\frac\s*(\\[a-zA-Z]+|\\.|.)\s*"
            r"(\\dot\\(bigvee|cup|cap|lor|vee)|"
            r"\\not\\(subset|supset|subseteq|supseteq|in|ni|"
            r"preceq|succeq|vartrianglelefteq|"
            r"vartrianglerighteq|trianglelefteq|"
            r"trianglerighteq)|"
            r"\\widehat\{=\}|\\widehat=|"
            r"\\overset\{?\}\{=\}|"
            r"\\overset\?=|"
            r"\\overset\{\\operatorname\{def\}\}\{=\}|"
            r"\\[a-zA-Z]+|\\.|.)|"
            r"(\\(mathcal|mathfrak|mathbb|text|begin|end|pmod)"
            r"\b\s*|"
            r"\\sqrt\b(\[\d+\])?)?"
            r"[_^]?(\\[a-zA-Z]+\s*|\\.|\w+|.)",
            text,
        ):
            v = m.group(0).strip()
            if not v:
                continue
            v = expand_group(v)
            if v:
                # Keep a space between a letter or digit and a following
                # digit so that adjacent tokens do not fuse into one number
                # or identifier.
                if (
                    parts and parts[-1][-1].isalpha() and v[0] in "0123456789"
                ) or (
                    parts
                    and parts[-1][-1] in "0123456789"
                    and v[0] in "0123456789"
                ):
                    v = " " + v
                parts.append(v)

        text = "".join(parts)
        return text

    text = recurse(text)
    # print("math text final: {!r}".format(text))
    return text
def bold_follows(parts: list[str], i: int) -> bool:
    """Returns True if some element after parts[i] starts with a bold
    marker (three or more apostrophes).  Anything else in between,
    including italic markers (''), is skipped over."""
    return any(later.startswith("'''") for later in parts[i + 1 :])
def remove_italic_and_bold(text: str) -> str:
    """Removes wikitext italic ('') and bold (''') markers from text.

    Based on token_iter in wikitextprocessor.  Markers are interpreted line
    by line: the italic/bold state is reset at every run of newlines.  Runs
    of apostrophes are consumed as 5 (bold+italic), 3 (bold) or 2 (italic)
    characters; any apostrophes beyond that are kept as literal text.  All
    other text, including the newlines themselves, is returned unchanged."""
    assert isinstance(text, str)
    ITALIC, BOLD = 1, 2  # bit flags; ITALIC | BOLD means "in both"
    parts_re = re.compile(r"(''+)")
    new_text_parts = []
    # The capturing group keeps the newline runs in the result, so they are
    # copied through as ordinary (marker-free) chunks.  No extra "\n" must
    # be added between chunks, otherwise every line break would be tripled.
    for line in re.split(r"(\n+)", text):
        parts = re.split(parts_re, line)
        state = 0
        for i, part in enumerate(parts):
            if not part.startswith("''"):
                new_text_parts.append(part)
                continue
            # This is a bold/italic part.  Decide how many apostrophes it
            # consumes and toggle the corresponding state bits.
            if part.startswith("'''''"):
                consumed = 5
                state ^= ITALIC | BOLD
            elif part.startswith("'''"):
                if state == ITALIC and not bold_follows(parts, i):
                    # No bold marker later on the line: read this as a
                    # closing italic preceded by a literal apostrophe.
                    consumed = 2
                    state = 0
                else:
                    consumed = 3
                    state ^= BOLD
            else:
                consumed = 2
                state ^= ITALIC
            part = part[consumed:]
            if part:
                new_text_parts.append(part)
    return "".join(new_text_parts)
# regex to find File/Image link attributes that would mean an image
# is *not* inline: one of right/left/center/thumb/frame, delimited by "|"
# on both sides (optional whitespace around the word).
# NOTE(review): clean_value's repl_link_bars applies this with .match() to
# m.group(0), presumably the whole link text; since the pattern starts with
# a literal "|", it can only match if that text begins with "|" -- confirm
# whether .search() was intended.
NOT_INLINE_IMG_RE = re.compile(r"\|\s*(right|left|center|thumb|frame)\s*\|")

# Case-insensitive alternation of the URL_STARTS entries imported from
# wikitextprocessor.common.  The entries are joined without escaping, so
# presumably they are already valid regex fragments -- verify.
URL_STARTS_RE = re.compile(
    r"({})".format(r"|".join(URL_STARTS)), flags=re.IGNORECASE
)

# Compiled lazily on the first clean_value() call, because the File
# namespace prefixes are read from the WiktextractContext passed to it.
IMAGE_LINK_RE: Optional[re.Pattern] = None
1341def clean_value(
1342 wxr: WiktextractContext, title: str, no_strip=False, no_html_strip=False
1343) -> str:
1344 """Cleans a title or value into a normal string. This should basically
1345 remove any Wikimedia formatting from it: HTML tags, templates, links,
1346 emphasis, etc. This will also merge multiple whitespaces into one
1347 normal space and will remove any surrounding whitespace."""
1348 assert isinstance(wxr, WiktextractContext)
1349 assert isinstance(title, str)
1351 global IMAGE_LINK_RE
1352 if IMAGE_LINK_RE is None:
1353 image_link_prefixes = wxr.wtp.namespace_prefixes(
1354 wxr.wtp.NAMESPACE_DATA["File"]["id"], suffix=""
1355 )
1356 IMAGE_LINK_RE = re.compile(
1357 rf"(?:{'|'.join(image_link_prefixes)})\s*:", re.IGNORECASE
1358 )
1360 def repl_1(m: re.Match) -> str:
1361 return clean_value(wxr, m.group(1), no_strip=True)
1363 def repl_exturl(m: re.Match) -> str:
1364 args = re.split(r"\s+", m.group(1))
1365 i = 0
1366 while i < len(args) - 1:
1367 if not URL_STARTS_RE.match(args[i]):
1368 break
1369 i += 1
1370 return " ".join(args[i:])
1372 def repl_link(m: re.Match) -> str:
1373 before_colon = m.group(1)
1374 after_colon = m.group(3)
1375 if (
1376 before_colon is not None
1377 and IMAGE_LINK_RE.match(before_colon) is not None
1378 ):
1379 return ""
1380 if before_colon is not None and before_colon.strip(": ") in ("w", "s"):
1381 # Wikipedia or Wikisource link
1382 v = after_colon.split("|")[0]
1383 else:
1384 v = m.group(0).strip("[] ").split("|")[0]
1385 return clean_value(wxr, v, no_strip=True)
1387 def repl_link_bars(m: re.Match) -> str:
1388 link = m.group(1)
1389 if IMAGE_LINK_RE.match(link) is not None:
1390 # Handle File / Image / Fichier 'links' here.
1391 if NOT_INLINE_IMG_RE.match(m.group(0)) is None and "alt" in m.group(
1392 0
1393 ):
1394 # This image should be inline, so let's print its alt text
1395 alt_m = re.search(r"\|\s*alt\s*=([^]|]+)(\||\]\])", m.group(0))
1396 if alt_m is not None: 1396 ↛ 1398line 1396 didn't jump to line 1398 because the condition on line 1396 was always true
1397 return "[Alt: " + alt_m.group(1) + "]"
1398 return ""
1399 # m.group(5) sits inside a repeated group, so it only ever holds the
1400 # capture from the last repetition; the group indexes don't 'grow'
1401 return clean_value(wxr, m.group(5) or m.group(2) or "", no_strip=True)
1403 def repl_1_sup(m: re.Match) -> str:
1404 return to_superscript(clean_value(wxr, m.group(1)))
1406 def repl_1_sub(m: re.Match) -> str:
1407 return to_subscript(clean_value(wxr, m.group(1)))
1409 def repl_1_chem(m: re.Match) -> str:
1410 return to_chem(clean_value(wxr, m.group(1)))
1412 def repl_1_math(m: re.Match) -> str:
1413 v = to_math(m.group(1))
1414 # print("to_math:", ascii(v))
1415 return v
1417 def repl_1_syntaxhighlight(m: re.Match) -> str:
1418 # Content is preformatted
1419 return "\n" + m.group(1).strip() + "\n"
1421 # remove nowiki tag returned from `Wtp.node_to_html()`
1422 title = re.sub(r"<nowiki\s*/>", "", title)
1424 # Remove any remaining templates
1425 # title = re.sub(r"\{\{[^}]+\}\}", "", title)
1427 # Remove tables, which can contain other tables
1428 prev = ""
1429 while title != prev:
1430 prev = title
1431 title = re.sub(
1432 r"\{\|((?!\{\|)(?!\|\}).)*\|\}",
1433 "\n",
1434 title,
1435 flags=re.DOTALL,
1436 )
1437 # title = re.sub(r"(?s)\{\|.*?\|\}", "\n", title)
1438 # Remove second reference tags (<ref name="ref_name"/>)
1439 title = re.sub(r"<ref\s+name=\"[^\"]+\"\s*/>", "", title)
1440 # Remove references (<ref>...</ref>).
1441 title = re.sub(r"(?is)<ref\b\s*[^>/]*?>\s*.*?</ref\s*>", "", title)
1442 # Replace <span>...</span> by stripped content without newlines
1443 title = re.sub(
1444 r"(?is)<span\b\s*[^>]*?>(.*?)\s*</span\s*>",
1445 lambda m: re.sub(r"\s+", " ", m.group(1)),
1446 title,
1447 )
1448 # Replace <br/> by a newline (it is used to express alternatives in some
1449 # declensions)
1450 title = re.sub(r"(?si)\s*<br\s*/?>\n*", "\n", title)
1451 # Remove divs with floatright class (generated e.g. by {{ja-kanji|...}})
1452 title = re.sub(
1453 r'(?si)<div\b[^>]*?\bclass="[^"]*?\bfloatright\b[^>]*?>'
1454 r"((<div\b(<div\b.*?</div\s*>|.)*?</div>)|.)*?"
1455 r"</div\s*>",
1456 "",
1457 title,
1458 )
1459 # Remove divs with float: attribute
1460 title = re.sub(
1461 r'(?si)<div\b[^>]*?\bstyle="[^"]*?\bfloat:[^>]*?>'
1462 r"((<div\b(<div\b.*?</div\s*>|.)*?</div>)|.)*?"
1463 r"</div\s*>",
1464 "",
1465 title,
1466 )
1467 # Remove <sup> with previewonly class (generated e.g. by {{taxlink|...}})
1468 title = re.sub(
1469 r'(?si)<sup\b[^>]*?\bclass="[^"<>]*?'
1470 r"\bpreviewonly\b[^>]*?>"
1471 r".+?</sup\s*>",
1472 "",
1473 title,
1474 )
1475 # Remove <strong class="error">...</strong>
1476 title = re.sub(
1477 r'(?si)<strong\b[^>]*?\bclass="[^"]*?\berror\b[^>]*?>'
1478 r".+?</strong\s*>",
1479 "",
1480 title,
1481 )
1482 # Change <div> and </div> to newlines. Ditto for tr, li, table, dl, ul, ol
1483 title = re.sub(r"(?si)</?(div|tr|li|table|dl|ul|ol)\b[^>]*>", "\n", title)
1484 # Change <dt>, <dd>, </dt> and </dd> into newlines;
1485 # these generate new rows/lines.
1486 title = re.sub(r"(?i)</?d[dt]\s*>", "\n", title)
1487 # Change <td> </td> to spaces. Ditto for th.
1488 title = re.sub(r"(?si)</?(td|th)\b[^>]*>", " ", title)
1489 # Change <sup> ... </sup> to ^
1490 title = re.sub(r"(?si)<sup\b[^>]*>\s*</sup\s*>", "", title)
1491 title = re.sub(r"(?si)<sup\b[^>]*>(.*?)</sup\s*>", repl_1_sup, title)
1492 # Change <sub> ... </sub> to _
1493 title = re.sub(r"(?si)<sub\b[^>]*>\s*</sub\s*>", "", title)
1494 title = re.sub(r"(?si)<sub\b[^>]*>(.*?)</sub\s*>", repl_1_sub, title)
1495 # Change <chem> ... </chem> using subscripts for digits
1496 title = re.sub(r"(?si)<chem\b[^>]*>(.*?)</chem\s*>", repl_1_chem, title)
1497 # Change <math> ... </math> using special formatting.
1498 title = re.sub(r"(?si)<math\b[^>]*>(.*?)</math\s*>", repl_1_math, title)
1499 # Change <syntaxhighlight> ... </syntaxhighlight> using special formatting.
1500 title = re.sub(
1501 r"(?si)<syntaxhighlight\b[^>]*>(.*?)" r"</syntaxhighlight\s*>",
1502 repl_1_syntaxhighlight,
1503 title,
1504 )
1505 # Remove any remaining HTML tags.
1506 if not no_html_strip:
1507 title = re.sub(r"(?s)<[/!a-zA-Z][^>]*>", "", title)
1508 title = re.sub(r"(?s)</[^>]+>", "", title)
1509 else:
1510 # Strip <noinclude/> anyway
1511 title = re.sub(r"(?si)<noinclude\s*/\s*>", "", title)
1512 # Replace [...]
1513 title = re.sub(r"(?s)\[\s*\.\.\.\s*\]", "…", title)
1514 # Remove http links in superscript
1515 title = re.sub(r"\^\(\[?(https?:)?//[^]()]+\]?\)", "", title)
1516 # Remove any edit links to local pages
1517 title = re.sub(r"\[//[^]\s]+\s+edit\s*\]", "", title)
1518 # Replace links by their text
1520 category_ns_data: NamespaceDataEntry
1521 # XXX "Category" -> config variable for portability
1522 category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {}) # type: ignore[typeddict-item]
1523 # Fail if we received empty dict from .get()
1524 category_ns_names = {"Category", category_ns_data["name"]} | set(
1525 category_ns_data["aliases"]
1526 )
1527 category_names_pattern = rf"(?:{'|'.join(category_ns_names)})"
1528 while True:
1529 # Links may be nested, so keep replacing until there is no more change.
1530 orig = title
1531 title = re.sub(
1532 rf"(?si)\s*\[\[\s*{category_names_pattern}\s*:\s*([^]]+?)\s*\]\]",
1533 "",
1534 title,
1535 )
1536 title = re.sub(
1537 r"(?s)\[\[\s*:?([^]|#<>:&]+?)\s*(#[^][|<>]*?)?\]\]", repl_1, title
1538 )
1539 title = re.sub(
1540 r"(?s)\[\[\s*(([\w\d]+)\s*:)?\s*([^][#|<>]+?)"
1541 r"\s*(#[^][|]*?)?\|?\]\]",
1542 repl_link,
1543 title,
1544 )
1545 title = re.sub(
1546 r"(?s)\[\[\s*([^][|<>]+?)\s*\|"
1547 r"\s*(([^][|]|\[[^]]*\])+?)"
1548 r"(\s*\|\s*(([^][|]|\[[^]]*\])+?))*\s*\|*\]\]",
1549 repl_link_bars,
1550 title,
1551 )
1552 if title == orig:
1553 break
1554 # Replace remaining bracketed external links by their link text (or by the URL itself when there is no text).
1555 while True:
1556 orig = title
1557 title = re.sub(
1558 r"\[\s*((https?:|mailto:)?//([^][]+?))\s*\]", repl_exturl, title
1559 )
1560 if title == orig:
1561 break
1563 # Remove italic and bold
1564 title = remove_italic_and_bold(title)
1566 # Replace HTML entities
1567 title = html.unescape(title)
1568 title = title.replace("\xa0", " ") # nbsp
1569 # Remove left-to-right and right-to-left marks, zero-width characters and the byte order mark
1570 title = re.sub(r"[\u200e\u200f\u200b\u200d\u200c\ufeff]", "", title)
1571 # Replace whitespace sequences by a single space.
1572 # https://en.wikipedia.org/wiki/En_(typography)
1573 title = re.sub(r"[ \t\r\u2002]+", " ", title)
1574 title = re.sub(r" *\n+", "\n", title)
1575 # Eliminate spaces around ellipsis in brackets
1576 title = re.sub(r"\[\s*…\s*\]", "[…]", title)
1578 # This unicode quote seems to be used instead of apostrophe quite randomly
1579 # (about 4% of apostrophes in English entries, some in Finnish entries).
1580 # title = re.sub("\u2019", "'", title) # Note: no r"..." here!
1581 # Replace strange unicode quotes with normal quotes
1582 # title = re.sub(r"”", '"', title)
1583 # Replace unicode long dash by normal dash
1584 # title = re.sub(r"–", "-", title)
1586 # Remove whitespace before periods and commas etc
1587 # XXX we might re-enable this, now trying without as it is removing some
1588 # instances where we would want to leave the space
1589 # title = re.sub(r" ([.,;:!?)])", repl_1, title)
1590 # Strip surrounding whitespace.
1591 if not no_strip:
1592 title = title.strip()
1593 # Normalize different ways of writing accents into the NFC canonical form
1594 title = unicodedata.normalize("NFC", title)
1595 return title
def clean_template_args(
    wxr: WiktextractContext,
    ht: Union[TemplateArgs, TemplateParameters],
    no_strip=False,
) -> dict[Union[str, int], str]:
    """Return a new dictionary holding the cleaned template arguments.

    Every key and every value of ``ht`` is converted with ``str()`` and run
    through ``clean_value`` with ``no_html_strip=True``. ``no_strip`` is
    forwarded for the values only; keys are always cleaned with the default
    ``no_strip`` setting.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(ht, dict)
    cleaned: dict[Union[str, int], str] = {}
    for raw_key, raw_value in ht.items():
        # Clean the key first, then the value, one entry at a time.
        key = clean_value(wxr, str(raw_key), no_html_strip=True)
        cleaned[key] = clean_value(
            wxr, str(raw_value), no_strip=no_strip, no_html_strip=True
        )
    return cleaned