Coverage for src/wiktextract/clean.py: 84%
320 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1# This file contains code to clean Wiktionary annotations from a string and to
2# produce plain text from it, typically for glossary entries but this is also
3# called for various other data to produce clean strings.
4#
5# This file also contains code for cleaning qualifiers for the "tags" field.
6#
7# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
9import html
10import re
11import unicodedata
12from typing import Callable, Optional, Union
14from wikitextprocessor.common import MAGIC_FIRST, MAGIC_LAST, URL_STARTS
15from wikitextprocessor.core import NamespaceDataEntry, TemplateArgs
16from wikitextprocessor.parser import TemplateParameters
18from .wxr_context import WiktextractContext
20######################################################################
21# Cleaning values into plain text.
22######################################################################
# Maps a plain character to its Unicode superscript counterpart.  Several
# dash-like characters (and the CJK "一") all map to the superscript minus.
superscript_ht: dict[str, str] = {
    "0": "⁰",
    "1": "¹",
    "2": "²",
    "3": "³",
    "4": "⁴",
    "5": "⁵",
    "6": "⁶",
    "7": "⁷",
    "8": "⁸",
    "9": "⁹",
    "+": "⁺",
    "-": "⁻",
    "−": "⁻",
    "‐": "⁻",
    "–": "⁻",
    "—": "⁻",
    "一": "⁻",
    "=": "⁼",
    "(": "⁽",
    ")": "⁾",
    "A": "ᴬ",
    "B": "ᴮ",
    "D": "ᴰ",
    "E": "ᴱ",
    "G": "ᴳ",
    "H": "ᴴ",
    "I": "ᴵ",
    "J": "ᴶ",
    "K": "ᴷ",
    "L": "ᴸ",
    "M": "ᴹ",
    "N": "ᴺ",
    "O": "ᴼ",
    "P": "ᴾ",
    "R": "ᴿ",
    "T": "ᵀ",
    "U": "ᵁ",
    "V": "ⱽ",
    "W": "ᵂ",
    "a": "ᵃ",
    "b": "ᵇ",
    "c": "ᶜ",
    "d": "ᵈ",
    "e": "ᵉ",
    "f": "ᶠ",
    "g": "ᵍ",
    "h": "ʰ",
    "i": "ⁱ",
    "j": "ʲ",
    "k": "ᵏ",
    "l": "ˡ",
    "m": "ᵐ",
    "n": "ⁿ",
    "o": "ᵒ",
    "p": "ᵖ",
    "r": "ʳ",
    "s": "ˢ",
    "t": "ᵗ",
    "u": "ᵘ",
    "v": "ᵛ",
    "w": "ʷ",
    "x": "ˣ",
    "y": "ʸ",
    "z": "ᶻ",
    "β": "ᵝ",
    "γ": "ᵞ",
    "δ": "ᵟ",
    "θ": "ᶿ",
    "ι": "ᶥ",
    "φ": "ᵠ",
    "χ": "ᵡ",
    "∞": "\u2002᪲",  # This is a KLUDGE
}
# Maps a plain character to its Unicode subscript counterpart.
subscript_ht: dict[str, str] = {
    "0": "₀",
    "1": "₁",
    "2": "₂",
    "3": "₃",
    "4": "₄",
    "5": "₅",
    "6": "₆",
    "7": "₇",
    "8": "₈",
    "9": "₉",
    "+": "₊",
    "-": "₋",
    "−": "₋",
    "=": "₌",
    "(": "₍",
    ")": "₎",
    "a": "ₐ",
    "e": "ₑ",
    "h": "ₕ",
    "i": "ᵢ",
    "j": "ⱼ",
    "k": "ₖ",
    "l": "ₗ",
    "m": "ₘ",
    "n": "ₙ",
    "o": "ₒ",
    "p": "ₚ",
    "r": "ᵣ",
    "s": "ₛ",
    "t": "ₜ",
    "u": "ᵤ",
    "v": "ᵥ",
    "x": "ₓ",
    "ə": "ₔ",
    "ρ": "ᵨ",
    "φ": "ᵩ",
    "χ": "ᵪ",
}
def to_superscript(text: str) -> str:
    """Converts text to superscript.

    Returns the empty string for empty input.  When every character of
    ``text`` has an entry in ``superscript_ht``, the mapped characters are
    joined and returned.  Otherwise the text is returned unconverted in
    caret notation: ``^x`` for a single character, ``^(text)`` for longer
    input."""
    if not text:
        return ""
    if all(ch in superscript_ht for ch in text):
        return "".join(superscript_ht[ch] for ch in text)
    # At least one character has no superscript form; fall back to ASCII.
    if len(text) == 1:
        return "^" + text
    return f"^({text})"
def to_subscript(text: str) -> str:
    """Converts text to subscript.

    Returns the empty string for empty input.  When every character of
    ``text`` has an entry in ``subscript_ht``, the mapped characters are
    joined and returned.  Otherwise the text is returned unconverted in
    underscore notation: ``_x`` for a single character, ``_(text)`` for
    longer input."""
    if not text:
        return ""
    if all(ch in subscript_ht for ch in text):
        return "".join(subscript_ht[ch] for ch in text)
    # At least one character has no subscript form; fall back to ASCII.
    if len(text) == 1:
        return "_" + text
    return f"_({text})"
def to_chem(text: str) -> str:
    """Converts text to chemical formula, making digits subscript.

    Each character for which ``str.isdigit()`` is true is passed through
    ``to_subscript`` on its own; every other character is kept as is."""
    return "".join(to_subscript(ch) if ch.isdigit() else ch for ch in text)
# Mapping from Latex names to Unicode characters/strings.  This is the
# default mapping (some cases are handled specially in the code).
# Keys are command names without the leading backslash; an empty value
# means the command is dropped from the output.
math_map: dict[str, str] = {
    # XXX should probably change greek characters to non-slanted ones?
    "AC": "∿",
    "APLcomment": "⍝",
    "APLdownarrowbox": "⍗",
    "APLinput": "⍞",
    "APLinv": "⌹",
    "APLleftarrowbox": "⍇",
    "APLlog": "⍟",
    "APLrightarrowbox": "⍈",
    "APLuparrowbox": "⍐",
    "Angstroem": "Å",
    "Bot": "⫫",
    "Box": "□",
    "Bumpeq": "≎",
    "CIRCLE": "●",
    "Cap": "⋒",
    "CapitalDifferentialD": "ⅅ",
    "CheckedBox": "☑",
    "Circle": "○",
    "Coloneqq": "⩴",
    "ComplexI": "ⅈ",
    "ComplexJ": "ⅉ",
    "Cup": "⋓",
    "Delta": "Δ",
    "Diamond": "◇",
    "Diamondblack": "◆",
    "Diamonddot": "⟐",
    "DifferentialD": "ⅆ",
    "Digamma": "Ϝ",
    "Doteq": "≑",
    "DownArrowBar": "⤓",
    "DownLeftTeeVector": "⥞",
    "DownLeftVectorBar": "⥖",
    "DownRightTeeVector": "⥟",
    "DownRightVectorBar": "⥗",
    "Downarrow": "⇓",
    "Equal": "⩵",
    "Euler": "Ɛ",
    "ExponentialE": "ⅇ",
    "ExponetialE": "ⅇ",
    "Finv": "Ⅎ",
    "Gamma": "Γ",
    "Im": "ℑ",
    "Join": "⨝",
    "Koppa": "Ϟ",
    "LEFTCIRCLE": "◖",
    "LEFTcircle": "◐",
    "LHD": "◀",
    "LVec": "x⃖",
    "Lambda": "Λ",
    "Lbag": "⟅",
    "LeftArrowBar": "⇤",
    "LeftDownTeeVector": "⥡",
    "LeftDownVectorBar": "⥙",
    "LeftTeeVector": "⥚",
    "LeftTriangleBar": "⧏",
    "LeftUpTeeVector": "⥠",
    "LeftUpVectorBar": "⥘",
    "LeftVectorBar": "⥒",
    "Leftarrow": "⇐",
    "Leftrightarrow": "⇔",
    "Lleftarrow": "⇚",
    "Longleftarrow": "⟸",
    "Longleftrightarrow": "⟺",
    "Longmapsfrom": "⟽",
    "Longmapsto": "⟾",
    "Longrightarrow": "⟹",
    "Lparen": "⦅",
    "Lsh": "↰",
    "MapsDown": "↧",
    "MapsUp": "↥",
    "Mapsfrom": "⤆",
    "Mapsto": "⤇",
    "Micro": "µ",
    "Nearrow": "⇗",
    "NestedGreaterGreater": "⪢",
    "NestedLessLess": "⪡",
    "NotGreaterLess": "≹",
    "NotGreaterTilde": "≵",
    "NotLessTilde": "≴",
    "Nwarrow": "⇖",
    "Omega": "Ω",
    "Phi": "Φ",
    "Pi": "Π",
    "Proportion": "∷",
    "Psi": "Ψ",
    "Qoppa": "Ϙ",
    "RHD": "▶",
    "RIGHTCIRCLE": "◗",
    "RIGHTcircle": "◑",
    "Rbag": "⟆",
    "Re": "ℜ",
    "RightArrowBar": "⇥",
    "RightDownTeeVector": "⥝",
    "RightDownVectorBar": "⥕",
    "RightTeeVector": "⥛",
    "RightTriangleBar": "⧐",
    "RightUpTeeVector": "⥜",
    "RightUpVectorBar": "⥔",
    "RightVectorBar": "⥓",
    "Rightarrow": "⇒",
    "Rparen": "⦆",
    "Rrightarrow": "⇛",
    "Rsh": "↱",
    "S": "§",
    "Same": "⩶",
    "Sampi": "Ϡ",
    "Searrow": "⇘",
    "Sigma": "Σ",
    "Square": "☐",
    "Stigma": "Ϛ",
    "Subset": "⋐",
    "Sun": "☉",
    "Supset": "⋑",
    "Swarrow": "⇙",
    "Theta": "Θ",
    "Top": "⫪",
    "UpArrowBar": "⤒",
    "Uparrow": "⇑",
    "Updownarrow": "⇕",
    "Upsilon": "Υ",
    "VDash": "⊫",
    "VERT": "⦀",
    "Vdash": "⊩",
    "Vert": "‖",
    "Vvdash": "⊪",
    "XBox": "☒",
    "Xi": "Ξ",
    "Yup": "⅄",
    "_": "_",
    "aleph": "א",
    "alpha": "α",
    "amalg": "⨿",
    "anchor": "⚓",
    "angle": "∠",
    "approx": "≈",
    "approxeq": "≊",
    "aquarius": "♒",
    "arg": "arg",
    "aries": "♈",
    "arrowbullet": "➢",
    "ast": "∗",
    "asymp": "≍",
    "backepsilon": "϶",
    "backprime": "‵",
    "backsim": "∽",
    "backsimeq": "⋍",
    # NOTE(review): \backslash maps to the empty string while
    # \textbackslash maps to "\\" -- confirm this is intended.
    "backslash": "",
    "ballotx": "✗",
    "barin": "⋶",
    "barleftharpoon": "⥫",
    "barrightharpoon": "⥭",
    "barwedge": "⊼",
    "because": "∵",
    "beta": "β",
    "beth": "ב",
    "between": "≬",
    "bigcap": "∩",
    "bigcup": "∪",
    "biginterleave": "⫼",
    "bigodot": "⨀",
    "bigoplus": "⨁",
    "bigotimes": "⨂",
    "bigsqcap": "⨅",
    "bigsqcup": "⨆",
    "bigstar": "★",
    "bigtriangledown": "▽",
    "bigtriangleup": "△",
    "biguplus": "⨄",
    "bigvee": "∨",
    "bigwedge": "∧",
    "bij": "⤖",
    "biohazard": "☣",
    "blacklozenge": "⧫",
    "blacksmiley": "☻",
    "blacksquare": "■",
    "blacktriangledown": "▾",
    "blacktriangleleft": "◂",
    "blacktriangleright": "▸",
    "blacktriangleup": "▴",
    "bot": "⊥",
    "bowtie": "⋈",
    "boxast": "⧆",
    "boxbar": "◫",
    "boxbox": "⧈",
    "boxbslash": "⧅",
    "boxcircle": "⧇",
    "boxdot": "⊡",
    "boxminus": "⊟",
    "boxplus": "⊞",
    "boxslash": "⧄",
    "boxtimes": "⊠",
    "bullet": "•",
    "bumpeq": "≏",
    "cancer": "♋",
    "cap": "∩",
    "capricornus": "♑",
    "capwedge": "⩄",
    "cat": "⁀",
    "cdot": "·",
    "cdots": "⋯",
    "cent": "¢",
    "checkmark": "✓",
    "chi": "χ",
    "circ": "∘",
    "circeq": "≗",
    "circlearrowleft": "↺",
    "circlearrowright": "↻",
    "circledR": "®",
    "circledast": "⊛",
    "circledbslash": "⦸",
    "circledcirc": "⊚",
    "circleddash": "⊝",
    "circledgtr": "⧁",
    "circledless": "⧀",
    "clubsuit": "♣",
    "colon": ":",
    "coloneq": "≔",
    "complement": "∁",
    "cong": "≅",
    "coprod": "∐",
    "corresponds": "≙",
    "cup": "∪",
    "curlyeqprec": "⋞",
    "curlyeqsucc": "⋟",
    "curlyvee": "⋎",
    "curlywedge": "⋏",
    "curvearrowleft": "↶",
    "curvearrowright": "↷",
    "dagger": "†",
    "daleth": "ד",
    "dashleftarrow": "⇠",
    "dashrightarrow": "⇢",
    "dashv": "⊣",
    "ddagger": "‡",
    "delta": "δ",
    "diameter": "∅",
    "diamond": "⋄",
    "diamondsuit": "♢",
    "digamma": "ϝ",
    "div": "÷",
    "divideontimes": "⋇",
    "dlsh": "↲",
    "dot\\bigvee": "⩒",
    "dot\\cap": "⩀",
    "dot\\cup": "⊍",
    "dot\\lor": "⩒",
    "dot\\vee": "⩒",
    "doteq": "≐",
    "dotplus": "∔",
    "dots": "…",
    "doublebarwedge": "⩞",
    "downarrow": "↓",
    "downdownarrows": "⇊",
    "downdownharpoons": "⥥",
    "downharpoonleft": "⇃",
    "downharpoonright": "⇂",
    "downuparrows": "⇵",
    "downupharpoons": "⥯",
    "drsh": "↳",
    "dsub": "⩤",
    "earth": "♁",
    "eighthnote": "♪",
    "ell": "ℓ",
    "emptyset": "∅",
    "epsilon": "ϵ",
    "eqcirc": "≖",
    "eqcolon": "∹",
    "eqsim": "≂",
    "eqslantgtr": "⪖",
    "eqslantless": "⪕",
    "equiv": "≡",
    "eta": "η",
    "eth": "ð",
    "exists": "∃",
    "fallingdotseq": "≒",
    "fcmp": "⨾",
    "female": "♀",
    "ffun": "⇻",
    "finj": "⤕",
    "fint": "⨏",
    "flat": "♭",
    "footnotesize": "",
    "forall": "∀",
    "fourth": "⁗",
    "frown": "⌢",
    "frownie": "☹",
    "gamma": "γ",
    "ge": "≥",  # \ge is a synonym of \geq (greater than OR equal)
    "gemini": "♊",
    "geq": "≥",
    "geqq": "≧",
    "geqslant": "⩾",
    "gg": "≫",
    "ggcurly": "⪼",
    "ggg": "⋙",
    "gimel": "ג",
    "gnapprox": "⪊",
    "gneq": "⪈",
    "gneqq": "≩",
    "gnsim": "⋧",
    "gtrapprox": "⪆",
    "gtrdot": "⋗",
    "gtreqless": "⋛",
    "gtreqqless": "⪌",
    "gtrless": "≷",
    "gtrsim": "≳",
    "hash": "⋕",
    "heartsuit": "♡",
    "hookleftarrow": "↩",
    "hookrightarrow": "↪",
    "hslash": "ℏ",
    "iddots": "⋰",
    "iff": "⟺",
    "iiiint": "⨌",
    "iiint": "∭",
    "iint": "∬",
    "imath": "ı",
    "implies": "⟹",
    "in": "∈",
    "infty": "∞",
    "int": "∫",
    "intercal": "⊺",
    "interleave": "⫴",
    "invamp": "⅋",
    "invdiameter": "⍉",
    "invneg": "⌐",
    "iota": "ι",
    "jmath": "ȷ",
    "jupiter": "♃",
    "kappa": "κ",
    "koppa": "ϟ",
    "lambda": "λ",
    "land": "∧",
    "lang": "⟪",
    "langle": "⟨",
    "large": "",
    "lblot": "⦉",
    "lbrace": "{",
    "lbrack": "[",
    "lceil": "⌈",
    "ldots": "…",
    "le": "≤",  # \le is a synonym of \leq (less than OR equal)
    "leadsto": "⤳",
    "leftarrow": "←",
    "leftarrowtail": "↢",
    "leftarrowtriangle": "⇽",
    "leftbarharpoon": "⥪",
    "leftharpoondown": "↽",
    "leftharpoonup": "↼",
    "leftleftarrows": "⇇",
    "leftleftharpoons": "⥢",
    "leftmoon": "☾",
    "leftrightarrow": "↔",
    "leftrightarrows": "⇆",
    "leftrightarrowtriangle": "⇿",
    "leftrightharpoon": "⥊",
    "leftrightharpoondown": "⥐",
    "leftrightharpoons": "⇋",
    "leftrightharpoonup": "⥎",
    "leftrightsquigarrow": "↭",
    "leftslice": "⪦",
    "leftsquigarrow": "⇜",
    "leftthreetimes": "⋋",
    "leftupdownharpoon": "⥑",
    "leo": "♌",
    "leq": "≤",
    "leqq": "≦",
    "leqslant": "⩽",
    "lessapprox": "⪅",
    "lessdot": "⋖",
    "lesseqgtr": "⋚",
    "lesseqqgtr": "⪋",
    "lessgtr": "≶",
    "lessim": "≲",
    "lesssim": "≲",
    "lfloor": "⌊",
    "lgroup": "⟮",
    "lhd": "◁",
    "libra": "♎",
    "lightning": "↯",
    "limg": "⦇",
    "ll": "≪",
    "llbracket": "⟦",
    "llcorner": "⌞",
    "llcurly": "⪻",
    "lll": "⋘",
    "lnapprox": "⪉",
    "lneq": "⪇",
    "lneqq": "≨",
    "lnot": "¬",
    "lnsim": "⋦",
    "longleftarrow": "⟵",
    "longleftrightarrow": "⟷",
    "longmapsfrom": "⟻",
    "longmapsto": "⟼",
    "longrightarrow": "⟶",
    "looparrowleft": "↫",
    "looparrowright": "↬",
    "lor": "∨",
    "lozenge": "◊",
    "lrcorner": "⌟",
    "ltimes": "⋉",
    "male": "♂",
    "maltese": "✠",
    "mapsfrom": "↤",
    "mapsto": "↦",
    "measuredangle": "∡",
    "medbullet": "⚫",
    "medcirc": "⚪",
    "mercury": "☿",
    "mho": "℧",
    "mid": "∣",
    "mlcp": "⫛",
    "mod": " mod ",
    "models": "⊧",
    "mp": "∓",
    "mu": "μ",
    "multimap": "⊸",
    "multimapboth": "⧟",
    "multimapdotbothA": "⊶",
    "multimapdotbothB": "⊷",
    "multimapinv": "⟜",
    "nLeftarrow": "⇍",
    "nLeftrightarrow": "⇎",
    "nRightarrow": "⇏",
    "nVDash": "⊯",
    "nVdash": "⊮",
    "nabla": "∇",
    "napprox": "≉",
    "natural": "♮",
    "ncong": "≇",
    "nearrow": "↗",
    "neg": "¬",
    "neptune": "♆",
    "neq": "≠",
    "nequiv": "≢",
    "nexists": "∄",
    "ngeq": "≱",
    "ngtr": "≯",
    "ni": "∋",
    "nleftarrow": "↚",
    "nleftrightarrow": "↮",
    "nleq": "≰",
    "nless": "≮",
    "nmid": "∤",
    "nni": "∌",
    "normalsize": "",
    "not\\in": "∉",
    "not\\ni": "∌",
    "not\\preceq": "⋠",
    "not\\subset": "⊄",
    "not\\subseteq": "⊈",
    "not\\succeq": "⋡",
    "not\\supset": "⊅",
    "not\\supseteq": "⊉",
    "not\\trianglelefteq": "⋬",
    "not\\trianglerighteq": "⋭",
    "not\\vartriangleleft": "⋪",
    "not\\vartriangleright": "⋫",
    "notasymp": "≭",
    "notbackslash": "⍀",
    "notin": "∉",
    "notslash": "⌿",
    "nparallel": "∦",
    "nprec": "⊀",
    "npreceq": "⋠",
    "nrightarrow": "↛",
    "nsim": "≁",
    "nsimeq": "≄",
    "nsqsubseteq": "⋢",
    "nsqsupseteq": "⋣",
    "nsubset": "⊄",
    "nsubseteq": "⊈",
    "nsucc": "⊁",
    "nsucceq": "⋡",
    "nsupset": "⊅",
    "nsupseteq": "⊉",
    "ntriangleleft": "⋪",
    "ntrianglelefteq": "⋬",
    "ntriangleright": "⋫",
    "ntrianglerighteq": "⋭",
    "nu": "ν",
    "nvDash": "⊭",
    "nvdash": "⊬",
    "nwarrow": "↖",
    "odot": "⊙",
    "oiiint": "∰",
    "oiint": "∯",
    "oint": "∮",
    "ointctrclockwise": "∳",
    "omega": "ω",
    "ominus": "⊖",
    "oplus": "⊕",
    "oslash": "⊘",
    "otimes": "⊗",
    "over": "/",
    "overbrace": "⏞",
    "overleftrightarrow": "x⃡",
    "overparen": "⏜",
    "overset?=": "≟",
    "overset{?}{=}": "≟",
    "overset{\\operatorname{def}}{=}": "≝",
    "parallel": "∥",
    "partial": "∂",
    "pencil": "✎",
    "perp": "⊥",
    "pfun": "⇸",
    "phi": "ϕ",
    "pi": "π",
    "pinj": "⤔",
    "pisces": "♓",
    "pitchfork": "⋔",
    "pluto": "♇",
    "pm": "±",
    "pointright": "☞",
    "pounds": "£",
    "prec": "≺",
    "precapprox": "⪷",
    "preccurlyeq": "≼",
    "preceq": "⪯",
    "preceqq": "⪳",
    "precnapprox": "⪹",
    "precnsim": "⋨",
    "precsim": "≾",
    "prime": "′",
    "prod": "∏",
    "propto": "∝",
    "psi": "ψ",
    "psur": "⤀",
    "qoppa": "ϙ",
    "quad": " ",
    "quarternote": "♩",
    "radiation": "☢",
    "rang": "⟫",
    "rangle": "⟩",
    "rarr": "→",
    "rblot": "⦊",
    "rbrace": "}",
    "rbrack": "]",
    "rceil": "⌉",
    "recycle": "♻",
    "rfloor": "⌋",
    "rgroup": "⟯",
    "rhd": "▷",
    "rho": "ρ",
    "rightangle": "∟",
    "rightarrow": "→",
    "rightarrowtail": "↣",
    "rightarrowtriangle": "⇾",
    "rightbarharpoon": "⥬",
    "rightharpoondown": "⇁",
    "rightharpoonup": "⇀",
    "rightleftarrows": "⇄",
    "rightleftharpoon": "⥋",
    "rightleftharpoons": "⇌",
    "rightmoon": "☽",
    "rightrightarrows": "⇉",
    "rightrightharpoons": "⥤",
    "rightslice": "⪧",
    "rightsquigarrow": "⇝",
    "rightthreetimes": "⋌",
    "rightupdownharpoon": "⥏",
    "rimg": "⦈",
    "risingdotseq": "≓",
    "rrbracket": "⟧",
    "rsub": "⩥",
    "rtimes": "⋊",
    "sagittarius": "♐",
    "sampi": "ϡ",
    "saturn": "♄",
    "scorpio": "♏",
    "scriptsize": "",
    "searrow": "↘",
    "second": "″",
    "setminus": "⧵",
    "sharp": "♯",
    "sigma": "σ",
    "sim": "∼",
    "simeq": "≃",
    "sixteenthnote": "♬",
    "skull": "☠",
    "slash": "∕",
    "small": "",
    "smallsetminus": "∖",
    "smalltriangledown": "▿",
    "smalltriangleleft": "◃",
    "smalltriangleright": "▹",
    "smalltriangleup": "▵",
    "smile": "⌣",
    "smiley": "☺",
    "spadesuit": "♠",
    "spddot": "¨",
    "sphat": "^",
    "sphericalangle": "∢",
    "spot": "⦁",
    "sptilde": "~",
    "sqcap": "⊓",
    "sqcup": "⊔",
    "sqint": "⨖",
    "sqrt": "√",  # ∛ ∜ - partly special handling below
    "sqrt[3]": "∛",
    "sqrt[4]": "∜",
    "sqsubset": "⊏",
    "sqsubseteq": "⊑",
    "sqsupset": "⊐",
    "sqsupseteq": "⊒",
    "square": "□",
    "sslash": "⫽",
    "star": "⋆",
    "steaming": "☕",
    "stigma": "ϛ",
    "strictfi": "⥼",
    "strictif": "⥽",
    "subset": "⊂",
    "subseteq": "⊆",
    "subseteqq": "⫅",
    "subsetneq": "⊊",
    "subsetneqq": "⫋",
    "succ": "≻",
    "succapprox": "⪸",
    "succcurlyeq": "≽",
    "succeq": "⪰",
    "succeqq": "⪴",
    "succnapprox": "⪺",
    "succnsim": "⋩",
    "succsim": "≿",
    "sum": "∑",
    "sun": "☼",
    "supset": "⊃",
    "supseteq": "⊇",
    "supseteqq": "⫆",
    "supsetneq": "⊋",
    "supsetneqq": "⫌",
    "swarrow": "↙",
    "swords": "⚔",
    "talloblong": "⫾",
    "tau": "τ",
    "taurus": "♉",
    "tcohm": "Ω",
    "textbackslash": "\\",
    "textbar": "|",
    "textbullet": "•",
    "textgreater": ">",
    "textless": "<",
    "textprime": "′",
    "therefore": "∴",
    "theta": "θ",
    "third": "‴",
    "times": "×",
    "tiny": "",
    "to": "→",
    "top": "⊤",
    "triangle": "∆",
    "trianglelefteq": "⊴",
    "triangleq": "≜",
    "trianglerighteq": "⊵",
    "twoheadleftarrow": "↞",
    "twoheadrightarrow": "↠",
    "twonotes": "♫",
    "ulcorner": "⌜",
    "underbar": " ̱",
    "underbrace": "⏟",
    "underleftarrow": "x⃮",
    "underline": " ̲",
    "underparen": "⏝",
    "underrightarrow": "x⃯",
    "uparrow": "↑",
    "updownarrow": "↕",
    "updownarrows": "⇅",
    "updownharpoons": "⥮",
    "upharpoonleft": "↿",
    "upharpoonright": "↾",
    "uplus": "⊎",
    "upsilon": "υ",
    "upuparrows": "⇈",
    "upupharpoons": "⥣",
    "uranus": "♅",
    "urcorner": "⌝",
    "utilde": " ̰",
    "vDash": "⊨",
    "varbeta": "β",
    "varclubsuit": "♧",
    "vardiamondsuit": "♦",
    "varepsilon": "ε",
    "varheartsuit": "♥",
    "varkappa": "ϰ",
    "varnothing": "∅",
    "varointclockwise": "∲",
    "varphi": "φ",
    "varpi": "ϖ",
    "varprod": "⨉",
    "varrho": "ϱ",
    "varsigma": "ς",
    "varspadesuit": "♤",
    "vartheta": "θ",
    "vartriangleleft": "⊲",
    "vartriangleright": "⊳",
    "vdash": "⊢",
    "vdots": "⋮",
    "vee": "∨",
    "veebar": "⊻",
    "vert": "|",
    "virgo": "♍",
    "warning": "⚠",
    "wasylozenge": "⌑",
    "wedge": "∧",
    "widehat=": "≙",
    "widehat{=}": "≙",
    "wp": "℘",
    "wr": "≀",
    "xi": "ξ",
    "yen": "¥",
    "yinyang": "☯",
    "zcmp": "⨟",
    "zeta": "ζ",
    "zhide": "⧹",
    "zpipe": "⨠",
    "zproject": "⨡",
    "|": "‖",
    # Accents XXX these really should be handled specially with diacritics
    # after argument
    "acute": "́",
    "bar": "̄",
    "breve": "̆",
    "check": "̌",
    "ddddot": "⃜",
    "dddot": "⃛",
    "ddot": "̈",
    "ddots": "⋱",
    "dot": "̇",
    "grave": "̀",
    "hat": "̂",
    "lvec": "⃐",
    "mathring": "̊",
    "not": "̸",
    "overline": "◌̅",
    "tilde": "̃",
    "vec": "⃑",
    # Some ignored operators
    "bigl": "",
    "bigr": "",
    "left": "",
    "right": "",
    "style": "",
    "textstyle": "",
    "mathrm": "",
}
# Maps ASCII letters to their mathematical script (\mathcal) counterparts.
mathcal_map: dict[str, str] = {
    "A": "𝒜",
    "B": "ℬ",
    "C": "𝒞",
    "D": "𝒟",
    "E": "ℰ",
    "F": "ℱ",
    "G": "𝒢",
    "H": "ℋ",
    "I": "ℐ",
    "J": "𝒥",
    "K": "𝒦",
    "L": "ℒ",
    "M": "ℳ",
    "N": "𝒩",
    "O": "𝒪",
    "P": "𝒫",
    "Q": "𝒬",
    "R": "ℛ",
    "S": "𝒮",
    "T": "𝒯",
    "U": "𝒰",
    "V": "𝒱",
    "W": "𝒲",
    "X": "𝒳",
    "Y": "𝒴",
    "Z": "𝒵",
    "a": "𝒶",
    "b": "𝒷",
    "c": "𝒸",
    "d": "𝒹",
    "e": "ℯ",
    "f": "𝒻",
    "g": "ℊ",
    "h": "𝒽",
    "i": "𝒾",
    "j": "𝒿",
    "k": "𝓀",
    "l": "𝓁",
    "m": "𝓂",
    "n": "𝓃",
    "o": "ℴ",
    "p": "𝓅",
    "q": "𝓆",
    "r": "𝓇",
    "s": "𝓈",
    "t": "𝓉",
    "u": "𝓊",
    "v": "𝓋",
    "w": "𝓌",
    "x": "𝓍",
    "y": "𝓎",
    "z": "𝓏",
}
# Maps ASCII capital letters to their Fraktur (\mathfrak) counterparts.
# C, H, I, R and Z come from the Letterlike Symbols block; the remaining
# capitals come from the Mathematical Alphanumeric Symbols block.
# Lowercase letters have no entry here.
mathfrak_map: dict[str, str] = {
    "A": "𝔄",
    "B": "𝔅",
    "C": "ℭ",
    "D": "𝔇",
    "E": "𝔈",
    "F": "𝔉",
    "G": "𝔊",
    "H": "ℌ",
    "I": "ℑ",
    "J": "𝔍",
    "K": "𝔎",
    "L": "𝔏",
    "M": "𝔐",
    "N": "𝔑",
    "O": "𝔒",
    "P": "𝔓",
    "Q": "𝔔",
    "R": "ℜ",
    "S": "𝔖",
    "T": "𝔗",
    "U": "𝔘",
    "V": "𝔙",
    "W": "𝔚",
    "X": "𝔛",
    "Y": "𝔜",
    "Z": "ℨ",
}
# Maps ASCII letters, digits and a few Greek letter names to their
# double-struck (\mathbb) counterparts.
mathbb_map: dict[str, str] = {
    "A": "𝔸",
    "B": "𝔹",
    "C": "ℂ",
    "D": "𝔻",
    "E": "𝔼",
    "F": "𝔽",
    "G": "𝔾",
    "H": "ℍ",
    "I": "𝕀",
    "J": "𝕁",
    "K": "𝕂",
    "L": "𝕃",
    "M": "𝕄",
    "N": "ℕ",
    "O": "𝕆",
    "P": "ℙ",
    "Q": "ℚ",
    "R": "ℝ",
    "S": "𝕊",
    "T": "𝕋",
    "U": "𝕌",
    "V": "𝕍",
    "W": "𝕎",
    "X": "𝕏",
    "Y": "𝕐",
    "Z": "ℤ",
    "a": "𝕒",
    "b": "𝕓",
    "c": "𝕔",
    "d": "𝕕",
    "e": "𝕖",
    "f": "𝕗",
    "g": "𝕘",
    "h": "𝕙",
    "i": "𝕚",
    "j": "𝕛",
    "k": "𝕜",
    "l": "𝕝",
    "m": "𝕞",
    "n": "𝕟",
    "o": "𝕠",
    "p": "𝕡",
    "q": "𝕢",
    "r": "𝕣",
    "s": "𝕤",
    "t": "𝕥",
    "u": "𝕦",
    "v": "𝕧",
    "w": "𝕨",
    "x": "𝕩",
    "y": "𝕪",
    "z": "𝕫",
    # Multi-character keys: only reachable by a whole-name lookup, not by a
    # character-by-character conversion.
    "pi": "ℼ",
    "gamma": "ℽ",
    "Gamma": "ℾ",
    "Pi": "ℿ",
    "Sigma": "⅀",
    "0": "𝟘",
    "1": "𝟙",
    "2": "𝟚",
    "3": "𝟛",
    "4": "𝟜",
    "5": "𝟝",
    "6": "𝟞",
    "7": "𝟟",
    "8": "𝟠",
    "9": "𝟡",
}
def mathcal_fn(text: str) -> str:
    """Converts each character of text via ``mathcal_map``; characters
    without an entry are kept unchanged."""
    return "".join(mathcal_map.get(ch, ch) for ch in text)
def mathfrak_fn(text: str) -> str:
    """Converts each character of text via ``mathfrak_map``; characters
    without an entry are kept unchanged."""
    return "".join(mathfrak_map.get(ch, ch) for ch in text)
def mathbb_fn(text: str) -> str:
    """Converts each character of text via ``mathbb_map``; characters
    without an entry are kept unchanged.

    The lookup is per character, so the multi-character keys of
    ``mathbb_map`` are never matched here."""
    return "".join(mathbb_map.get(ch, ch) for ch in text)
def to_math(text: str) -> str:
    """Converts a LaTeX-style mathematical formula to plain text.

    Brace groups are converted innermost-first; each converted group is
    stored in ``magic_vec`` and replaced in the text by a single
    placeholder character (``chr(MAGIC_FIRST + index)``).  The remaining
    text is then tokenized and each token is translated by
    ``expand_group``, which also substitutes the placeholders back.
    """
    # print("to_math: {!r}".format(text))
    # Converted brace groups, indexed by (placeholder code - MAGIC_FIRST).
    magic_vec: list[str] = []

    def expand(text: str) -> str:
        """Replaces placeholder characters by their stored text until no
        more replacements happen (stored text may contain placeholders)."""
        # formatting with {:c} converts input into character
        magic_re = re.compile(
            r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST)
        )
        while True:
            orig = text
            text = magic_re.sub(
                lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST],
                text,
            )
            if text == orig:
                break
        return text

    def recurse(text: str) -> str:
        """Converts one nesting level of the formula to plain text."""

        def math_magic(
            text: str, left: str, right: str, fn: Callable[[str], str]
        ) -> str:
            """Repeatedly replaces every innermost ``left...right`` group
            by a placeholder whose stored value is ``fn(content)``."""
            esc_left = re.escape(left)
            esc_right = re.escape(right)
            # The content may contain neither delimiter, so only innermost
            # groups match on each pass.
            regexp = re.compile(
                f"{esc_left}([^{esc_left}{esc_right}]+){esc_right}"
            )

            def repl(m: re.Match) -> str:
                magic = chr(MAGIC_FIRST + len(magic_vec))
                t = fn(m.group(1)).strip()
                magic_vec.append(t)
                return magic

            while True:
                orig = text
                text = regexp.sub(repl, text)
                if text == orig:
                    break
            return text

        def expand_group(v: str) -> str:
            """Translates a single token produced by the tokenizer below."""
            # Optional per-character conversion applied at the end.
            fn: Optional[Callable[[str], str]] = None
            if re.match(r"\\mathcal\b", v):
                fn = mathcal_fn
                v = v[8:].strip()
            elif re.match(r"\\mathfrak\b", v):
                fn = mathfrak_fn
                v = v[9:].strip()
            elif re.match(r"\\mathbb\b", v):
                fn = mathbb_fn
                v = v[7:]
            elif re.match(r"\\(begin|end)\b", v):
                v = ""  # Skip
            elif re.match(r"\\text\b", v):
                v = v[5:]
            elif re.match(r"\\pmod\b", v):
                v = v[5:].strip()
                v = "(mod " + expand_group(v) + ")"
            elif re.match(r"\\sqrt\[", v):
                # NOTE(review): the slice assumes v is exactly "\sqrt[N]".
                # If the tokenizer passes a trailing operand in the same
                # token, it ends up inside `a` -- confirm.
                a = v[6:-1].strip()
                if a == "2":
                    v = "√"
                elif a == "3":
                    v = "∛"
                elif a == "4":
                    v = "∜"
                else:
                    v = to_superscript(a) + "√"
            elif re.match(r"\\sqrt($|[0-9]|\b)", v):
                v = "√"
            elif re.match(r"\\(frac|binom)($|[0-9]|\b)", v):
                m = re.match(
                    r"\\(frac|binom)\s*(\\[a-zA-Z]+|\\.|.)\s*"
                    r"(\\[a-zA-Z]+|\\.|.)$",
                    v,
                )
                if not m:
                    print(f"MATH FRAC/BINOM ERROR: {v!r}")
                    return v
                op, a, b = m.groups()
                a = expand_group(a).strip()
                b = expand_group(b).strip()
                # Parenthesize operands longer than one character.
                if len(a) > 1:
                    a = "(" + a + ")"
                if len(b) > 1:
                    b = "(" + b + ")"
                if op == "frac":
                    v = a + "/" + b
                elif op == "binom":
                    v = f"binom({a}, {b})"
                else:
                    # Should never get here
                    v = f"{op}({v})"
            elif v.startswith("_"):
                fn = to_subscript
                v = v[1:]
            elif v.startswith("^"):
                fn = to_superscript
                v = v[1:]
            # Whatever is left may still be a backslash command.
            if v.startswith("\\"):
                mapped = math_map.get(v[1:].strip())
                if mapped is None:
                    # Unknown command: keep its name, space-separated when
                    # it is alphanumeric.
                    if v[1:].strip().isalnum():
                        v = " " + v[1:].strip() + " "
                    else:
                        v = v[1:].strip()
                else:
                    v = mapped
            elif v.isspace() or v in ("&",):  # Ignore certain special chars
                v = ""
            if fn is not None:
                # Placeholders must be expanded before the per-character
                # conversion so the conversion sees the real text.
                v = expand(v)
                v = fn(v)
            v = expand(v)
            return v

        parts: list[str] = []
        # Replace brace groups by placeholders, innermost first.
        while True:
            orig = text
            text = math_magic(text, "{", "}", recurse)
            if text == orig:
                break
        # Tokenizer alternatives: a whitespace run; a \frac with two
        # operands; or a token optionally prefixed by a font/environment
        # command or \sqrt, and by "_" or "^".
        for m in re.finditer(
            r"\s+|"
            r"\\frac\s*(\\[a-zA-Z]+|\\.|.)\s*"
            r"(\\dot\\(bigvee|cup|cap|lor|vee)|"
            r"\\not\\(subset|supset|subseteq|supseteq|in|ni|"
            r"preceq|succeq|vartrianglelefteq|"
            r"vartrianglerighteq|trianglelefteq|"
            r"trianglerighteq)|"
            r"\\widehat\{=\}|\\widehat=|"
            r"\\overset\{?\}\{=\}|"
            r"\\overset\?=|"
            r"\\overset\{\\operatorname\{def\}\}\{=\}|"
            r"\\[a-zA-Z]+|\\.|.)|"
            r"(\\(mathcal|mathfrak|mathbb|text|begin|end|pmod)"
            r"\b\s*|"
            r"\\sqrt\b(\[\d+\])?)?"
            r"[_^]?(\\[a-zA-Z]+\s*|\\.|\w+|.)",
            text,
        ):
            v = m.group(0).strip()
            if not v:
                continue
            v = expand_group(v)
            if v:
                # Keep a digit from fusing with a preceding letter or digit.
                if parts and v[0] in "0123456789":
                    prev = parts[-1][-1]
                    if prev.isalpha() or prev in "0123456789":
                        v = " " + v
                parts.append(v)

        text = "".join(parts)
        return text

    text = recurse(text)
    # print("math text final: {!r}".format(text))
    return text
def bold_follows(parts: list[str], i: int) -> bool:
    """Checks if there is a bold (''') in parts after parts[i].  We allow
    intervening italics ('').

    Returns True when any element after index ``i`` starts with three
    apostrophes; all other elements are skipped."""
    return any(p.startswith("'''") for p in parts[i + 1 :])
def remove_italic_and_bold(text: str) -> str:
    """Removes wikitext italic ('') and bold (''') markers from text.

    Based on token_iter in wikitextprocessor.  The text is split into
    lines and newline runs; within each item, runs of two or more
    apostrophes are interpreted as italic/bold toggles and stripped.
    Apostrophes in a run beyond those consumed by the toggle are kept.

    NOTE(review): the newline runs produced by the split are kept as items
    of their own AND a "\\n" is appended after every item, so "a\\nb"
    becomes "a\\n\\n\\nb".  This is preserved as-is; confirm that callers
    rely on later whitespace normalization.
    """
    assert isinstance(text, str)
    quote_run_re = re.compile(r"(''+)")
    out: list[str] = []
    # The capturing group keeps the separators (newline runs) in the list.
    for line in re.split(r"(\n+)", text):
        parts = quote_run_re.split(line)
        # Formatting state is reset for every item.
        state = 0  # bit 1 = in italic, bit 2 = in bold (3 = in both)
        for i, part in enumerate(parts):
            if not part.startswith("''"):
                out.append(part)
                continue
            # This is a bold/italic part.  Decide how many apostrophes the
            # marker consumes and which state bits it toggles.
            if part.startswith("'''''"):
                consumed, toggled = 5, 3
            elif part.startswith("'''"):
                if state == 1 and not bold_follows(parts, i):
                    # In italic and no closing bold later on the line:
                    # treat it as an italic end and keep one apostrophe.
                    consumed, toggled = 2, 1
                else:
                    consumed, toggled = 3, 2
            else:
                consumed, toggled = 2, 1
            state ^= toggled
            rest = part[consumed:]
            if rest:
                out.append(rest)
        out.append("\n")
    # Drop the "\n" appended after the last item.
    return "".join(out[:-1])
# regex to find File/Image link attributes that would mean an image
# is *not* inline
NOT_INLINE_IMG_RE = re.compile(r"\|\s*(right|left|center|thumb|frame)\s*\|")

# Case-insensitive alternation of all URL_STARTS prefixes.
URL_STARTS_RE = re.compile(
    r"({})".format(r"|".join(URL_STARTS)), flags=re.IGNORECASE
)

# Compiled lazily on the first clean_value() call.
IMAGE_LINK_RE: Optional[re.Pattern] = None
def clean_value(
    wxr: WiktextractContext,
    title: str,
    no_strip: bool = False,
    no_html_strip: bool = False,
) -> str:
    """Cleans a title or value into a normal string.  This should basically
    remove any Wikimedia formatting from it: HTML tags, templates, links,
    emphasis, etc.  Runs of spaces, tabs and carriage returns are merged
    into one normal space, runs of newlines into a single newline, and
    (unless ``no_strip`` is set) surrounding whitespace is removed.  The
    result is normalized to Unicode NFC form.

    Args:
        wxr: The Wiktextract context; its ``wtp.NAMESPACE_DATA`` supplies
            the File and Category namespace names used to recognise image
            and category links.
        title: The raw text to clean.
        no_strip: If true, leading and trailing whitespace is kept.
        no_html_strip: If true, the final "remove any remaining HTML tags"
            step is skipped (only ``<noinclude/>`` is removed there).  The
            tag-specific handling before it (ref, span, br, div, sup, sub,
            chem, math, syntaxhighlight, ...) is still performed.

    Returns:
        The cleaned text.

    Side effects:
        The module-level ``IMAGE_LINK_RE`` is compiled on the first call,
        from the namespace data of the ``wxr`` passed to that call, and is
        reused afterwards.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(title, str)

    global IMAGE_LINK_RE
    if IMAGE_LINK_RE is None:
        image_link_prefixes = wxr.wtp.namespace_prefixes(
            wxr.wtp.NAMESPACE_DATA["File"]["id"], suffix=""
        )
        IMAGE_LINK_RE = re.compile(
            rf"(?:{'|'.join(image_link_prefixes)})\s*:", re.IGNORECASE
        )

    def repl_1(m: re.Match) -> str:
        # Replace the match by its (recursively cleaned) first group.
        return clean_value(wxr, m.group(1), no_strip=True)

    def repl_exturl(m: re.Match) -> str:
        # Split the bracket content on whitespace and drop the leading
        # parts that look like URLs.  The last part is always kept, so a
        # link consisting of only a URL is replaced by that URL.
        args = re.split(r"\s+", m.group(1))
        i = 0
        while i < len(args) - 1:
            if not URL_STARTS_RE.match(args[i]):
                break
            i += 1
        return " ".join(args[i:])

    def repl_link(m: re.Match) -> str:
        # Handles links without display text: [[prefix:target#anchor]].
        before_colon = m.group(1)
        after_colon = m.group(3)
        if (
            before_colon is not None
            and IMAGE_LINK_RE.match(before_colon) is not None
        ):
            # File/Image links without any text produce nothing.
            return ""
        if before_colon is not None and before_colon.strip(": ") in ("w", "s"):
            # Wikipedia or Wikisource link
            v = after_colon.split("|")[0]
        else:
            v = m.group(0).strip("[] ").split("|")[0]
        return clean_value(wxr, v, no_strip=True)

    def repl_link_bars(m: re.Match) -> str:
        # Handles links with one or more "|"-separated parts.
        link = m.group(1)
        if IMAGE_LINK_RE.match(link) is not None:
            # Handle File / Image / Fichier 'links' here.
            # Use search(), not match(): the positioning attribute sits in
            # the middle of the link text, which itself starts with "[[",
            # so an anchored match at position 0 could never succeed and
            # every image would wrongly be treated as inline.
            if NOT_INLINE_IMG_RE.search(
                m.group(0)
            ) is None and "alt" in m.group(0):
                # This image should be inline, so let's print its alt text
                alt_m = re.search(r"\|\s*alt\s*=([^]|]+)(\||\]\])", m.group(0))
                if alt_m is not None:
                    return "[Alt: " + alt_m.group(1) + "]"
            return ""
        # m.group(5) is always the last matching group because you can
        # only access the last matched group; the indexes don't 'grow'
        return clean_value(wxr, m.group(5) or m.group(2) or "", no_strip=True)

    def repl_1_sup(m: re.Match) -> str:
        return to_superscript(clean_value(wxr, m.group(1)))

    def repl_1_sub(m: re.Match) -> str:
        return to_subscript(clean_value(wxr, m.group(1)))

    def repl_1_chem(m: re.Match) -> str:
        return to_chem(clean_value(wxr, m.group(1)))

    def repl_1_math(m: re.Match) -> str:
        # The content is passed to to_math() raw, without cleaning.
        v = to_math(m.group(1))
        # print("to_math:", ascii(v))
        return v

    def repl_1_syntaxhighlight(m: re.Match) -> str:
        # Content is preformatted
        return "\n" + m.group(1).strip() + "\n"

    # remove nowiki tag returned from `Wtp.node_to_html()`
    title = re.sub(r"<nowiki\s*/>", "", title)

    # Remove any remaining templates
    # title = re.sub(r"\{\{[^}]+\}\}", "", title)

    # Remove tables, which can contain other tables.  The pattern only
    # matches a table with no table nested inside it, so repeat until
    # nothing changes in order to peel nested tables from the inside out.
    prev = ""
    while title != prev:
        prev = title
        title = re.sub(
            r"\{\|((?!\{\|)(?!\|\}).)*\|\}",
            "\n",
            title,
            flags=re.DOTALL,
        )
    # title = re.sub(r"(?s)\{\|.*?\|\}", "\n", title)
    # Remove second reference tags (<ref name="ref_name"/>)
    title = re.sub(r"<ref\s+name=\"[^\"]+\"\s*/>", "", title)
    # Remove references (<ref>...</ref>).
    title = re.sub(r"(?is)<ref\b\s*[^>/]*?>\s*.*?</ref\s*>", "", title)
    # Replace <span>...</span> by stripped content without newlines
    title = re.sub(
        r"(?is)<span\b\s*[^>]*?>(.*?)\s*</span\s*>",
        lambda m: re.sub(r"\s+", " ", m.group(1)),
        title,
    )
    # Replace <br/> (and any whitespace before it / newlines after it) by a
    # single newline (it is used to express alternatives in some
    # declensions)
    title = re.sub(r"(?si)\s*<br\s*/?>\n*", "\n", title)
    # Remove divs with floatright class (generated e.g. by {{ja-kanji|...}})
    title = re.sub(
        r'(?si)<div\b[^>]*?\bclass="[^"]*?\bfloatright\b[^>]*?>'
        r"((<div\b(<div\b.*?</div\s*>|.)*?</div>)|.)*?"
        r"</div\s*>",
        "",
        title,
    )
    # Remove divs with float: attribute
    title = re.sub(
        r'(?si)<div\b[^>]*?\bstyle="[^"]*?\bfloat:[^>]*?>'
        r"((<div\b(<div\b.*?</div\s*>|.)*?</div>)|.)*?"
        r"</div\s*>",
        "",
        title,
    )
    # Remove <sup> with previewonly class (generated e.g. by {{taxlink|...}})
    title = re.sub(
        r'(?si)<sup\b[^>]*?\bclass="[^"<>]*?'
        r"\bpreviewonly\b[^>]*?>"
        r".+?</sup\s*>",
        "",
        title,
    )
    # Remove <strong class="error">...</strong>
    title = re.sub(
        r'(?si)<strong\b[^>]*?\bclass="[^"]*?\berror\b[^>]*?>'
        r".+?</strong\s*>",
        "",
        title,
    )
    # Change <div> and </div> to newlines.  Ditto for tr, li, table, dl, ul, ol
    title = re.sub(r"(?si)</?(div|tr|li|table|dl|ul|ol)\b[^>]*>", "\n", title)
    # Change <dt>, <dd>, </dt> and </dd> into newlines;
    # these generate new rows/lines.
    title = re.sub(r"(?i)</?d[dt]\s*>", "\n", title)
    # Change <td> </td> to spaces.  Ditto for th.
    title = re.sub(r"(?si)</?(td|th)\b[^>]*>", " ", title)
    # Drop empty <sup></sup>; convert other <sup> ... </sup> content with
    # to_superscript()
    title = re.sub(r"(?si)<sup\b[^>]*>\s*</sup\s*>", "", title)
    title = re.sub(r"(?si)<sup\b[^>]*>(.*?)</sup\s*>", repl_1_sup, title)
    # Drop empty <sub></sub>; convert other <sub> ... </sub> content with
    # to_subscript()
    title = re.sub(r"(?si)<sub\b[^>]*>\s*</sub\s*>", "", title)
    title = re.sub(r"(?si)<sub\b[^>]*>(.*?)</sub\s*>", repl_1_sub, title)
    # Change <chem> ... </chem> using to_chem()
    title = re.sub(r"(?si)<chem\b[^>]*>(.*?)</chem\s*>", repl_1_chem, title)
    # Change <math> ... </math> using special formatting.
    title = re.sub(r"(?si)<math\b[^>]*>(.*?)</math\s*>", repl_1_math, title)
    # Change <syntaxhighlight> ... </syntaxhighlight> using special formatting.
    title = re.sub(
        r"(?si)<syntaxhighlight\b[^>]*>(.*?)" r"</syntaxhighlight\s*>",
        repl_1_syntaxhighlight,
        title,
    )
    # Remove any remaining HTML tags.
    if not no_html_strip:
        title = re.sub(r"(?s)<[/!a-zA-Z][^>]*>", "", title)
        title = re.sub(r"(?s)</[^>]+>", "", title)
    else:
        # Strip <noinclude/> anyway
        title = re.sub(r"(?si)<noinclude\s*/\s*>", "", title)
    # Replace [...]
    title = re.sub(r"(?s)\[\s*\.\.\.\s*\]", "…", title)
    # Remove http links in superscript
    title = re.sub(r"\^\(\[?(https?:)?//[^]()]+\]?\)", "", title)
    # Remove any edit links to local pages
    title = re.sub(r"\[//[^]\s]+\s+edit\s*\]", "", title)
    # Replace links by their text

    category_ns_data: NamespaceDataEntry
    # XXX "Category" -> config variable for portability
    category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {})  # type: ignore[typeddict-item]
    # Fail if we received empty dict from .get()
    category_ns_names = {"Category", category_ns_data["name"]} | set(
        category_ns_data["aliases"]
    )
    category_names_pattern = rf"(?:{'|'.join(category_ns_names)})"
    while True:
        # Links may be nested, so keep replacing until there is no more change.
        orig = title
        # Category links are removed entirely, with preceding whitespace.
        title = re.sub(
            rf"(?si)\s*\[\[\s*{category_names_pattern}\s*:\s*([^]]+?)\s*\]\]",
            "",
            title,
        )
        # Plain links without a namespace prefix or display text.
        title = re.sub(
            r"(?s)\[\[\s*:?([^]|#<>:]+?)\s*(#[^][|<>]*?)?\]\]", repl_1, title
        )
        # Links with an optional "prefix:" but without display text.
        title = re.sub(
            r"(?s)\[\[\s*(([\w\d]+)\s*:)?\s*([^][#|<>]+?)"
            r"\s*(#[^][|]*?)?\|?\]\]",
            repl_link,
            title,
        )
        # Links with one or more "|"-separated parts.
        title = re.sub(
            r"(?s)\[\[\s*([^][|<>]+?)\s*\|"
            r"\s*(([^][|]|\[[^]]*\])+?)"
            r"(\s*\|\s*(([^][|]|\[[^]]*\])+?))*\s*\|*\]\]",
            repl_link_bars,
            title,
        )
        if title == orig:
            break
    # Replace remaining external links ([url text]) by their text, or by
    # the URL itself when there is no text; see repl_exturl.
    while True:
        orig = title
        title = re.sub(
            r"\[\s*((https?:|mailto:)?//([^][]+?))\s*\]", repl_exturl, title
        )
        if title == orig:
            break

    # Remove italic and bold
    title = remove_italic_and_bold(title)

    # Replace HTML entities
    title = html.unescape(title)
    title = title.replace("\xa0", " ")  # nbsp
    # Remove left-to-right and right-to-left marks, zero-width characters
    title = re.sub(r"[\u200e\u200f\u200b\u200d\u200c\ufeff]", "", title)
    # Replace whitespace sequences by a single space.
    title = re.sub(r"[ \t\r]+", " ", title)
    title = re.sub(r" *\n+", "\n", title)
    # Eliminate spaces around ellipsis in brackets
    title = re.sub(r"\[\s*…\s*\]", "[…]", title)

    # This unicode quote seems to be used instead of apostrophe quite randomly
    # (about 4% of apostrophes in English entries, some in Finnish entries).
    # title = re.sub("\u2019", "'", title)  # Note: no r"..." here!
    # Replace strange unicode quotes with normal quotes
    # title = re.sub(r"”", '"', title)
    # Replace unicode long dash by normal dash
    # title = re.sub(r"–", "-", title)

    # Remove whitespace before periods and commas etc
    # XXX we might re-enable this, now trying without as it is removing some
    # instances where we would want to leave the space
    # title = re.sub(r" ([.,;:!?)])", repl_1, title)
    # Strip surrounding whitespace.
    if not no_strip:
        title = title.strip()
    # Normalize different ways of writing accents into the NFC canonical form
    title = unicodedata.normalize("NFC", title)
    return title
def clean_template_args(
    wxr: WiktextractContext,
    ht: Union[TemplateArgs, TemplateParameters],
    no_strip=False,
) -> dict[Union[str, int], str]:
    """Returns a new dictionary in which every key and every value of the
    template argument dictionary ``ht`` has been converted to a string and
    passed through clean_value() with ``no_html_strip=True``.  ``no_strip``
    is forwarded for the values only; keys are always stripped."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(ht, dict)
    cleaned: dict[Union[str, int], str] = {}
    for raw_key, raw_value in ht.items():
        # Clean the key first, then the value, one entry at a time.
        key = clean_value(wxr, str(raw_key), no_html_strip=True)
        cleaned[key] = clean_value(
            wxr, str(raw_value), no_strip=no_strip, no_html_strip=True
        )
    return cleaned