Coverage for src/wiktextract/extractor/en/english_words.py: 100%
12 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
# Vocabulary of known English words.
#
# The vocabulary is mostly based on the NLTK Brown corpus, but we add some
# words and exclude others. These lists will likely need to be tweaked
# semi-frequently to add support for unrecognized sense descriptions.
#
# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
9import nltk # type: ignore[import-untyped]
10from nltk.corpus import brown # type: ignore[import-untyped]
12from .form_descriptions_known_firsts import known_firsts # w/ our additions
def _ensure_brown_corpus() -> None:
    """Fetch the NLTK Brown corpus unless it can already be located."""
    try:
        nltk.data.find("corpora/brown.zip")
    except LookupError:
        # The lookup failed, so download the corpus (without progress output).
        nltk.download("brown", quiet=True)


# Runs at import time so that the corpus is present before it is read below.
_ensure_brown_corpus()
# English words added on top of the default set taken from the Brown corpus.
# Multi-word expressions separated by spaces can also be added, but they must
# match the whole text (use them when the individual components should not be
# added on their own).
#
# NOTE(review): several entries look like misspellings (e.g. "agression",
# "belladona", "innapropriate"); presumably they mirror spellings that occur
# in the processed text -- confirm before "correcting" any of them.
additional_words = {
    "'",
    "ʹ",
    ".",
    ";",
    ":",
    "!",
    "‘",
    "’",
    '"',
    "“",
    "”",
    ",",
    "…",
    "...",
    "“.”",
    "—",
    "€",
    "1st",
    "2nd",
    "3rd",
    "4th",
    "5th",
    "6th",
    "7th",
    "8th",
    "9th",
    "10th",
    "100th",
    "AIDS",
    "AM",
    "ATP",
    "Ada Semantic Interface Specification",
    "Afghanistan",
    "Al Jazeera",
    "Albulidae",
    "Apple",
    "Arabic kaf",
    "Arabic waw",
    "Aristophanean",
    "ASCII",
    "BBC",
    "BDSM",
    "BMW",
    "BS",
    "Bardet-Biedl syndrome",
    "Beetle",
    "Bekenstein-Hawking entropy",
    "Blu-ray",
    "Blu-ray Disc",
    "Bohai Sea",
    "Caniformia",
    "Canoidea",
    "Caprobrotus",
    "Chaetodontinae",
    "Common",
    "Compatibility Decomposition",
    "Coriandum",
    "Cryptodiran",
    "Czech",
    "DNA",
    "Dirac",
    "Dr",
    "Epiprocta",
    "Esau",
    "Eucharist",
    "Euclidean",
    "Exmoor",
    "Feliformia",
    "Feloidea",
    "GUI",
    "GameCube",
    "Global Positioning System",
    "Guantanamo",
    "Gurmukhi digits",
    "HCHO",
    "HMMWV",
    "HTTP",
    "Handedness",
    "Hearthstone",
    "Hollandic",
    "Horae",
    "Hue Whiteness Blackness",
    "I Ching hexagrams",
    "IPA",
    "ISO",
    "Indo",
    "Inoperable",
    "Internet",
    "Judeo",
    "LGBT",
    "Lagerstomia",
    "Latinized",
    "Linux",
    "Lunar Module",
    "Lyman continuum photon",
    "Mac",
    "Mach",
    "Markarian",
    "Masturbation",
    "Maulisa",
    "McDonald's",
    "Mercenaria",
    "Merseyside",
    "Metric",
    "Monetaria",
    "Mr",
    "Mr Spock",
    "Mrs",
    "Ms",
    "Mugillidae",
    "Multiples",
    "NCO",
    "Nepali",
    "New",
    "Nintendo",
    "Noh",
    "Numbers",
    "Nun",
    "Onchorhynchus",
    "Orgasm",
    "OS",
    "Palmaiola",
    "Pentecost",
    "Phoenician",
    "Plebidonax",
    "PM",
    "Pornography",
    "Prof",
    "Roma",
    "Romani",
    "Russian krai",
    "Russophile",
    "SARS",
    "SI",
    "Sandwich",
    "Saskatchewan",
    "Shahmukhi digits",
    "Silent Information Regulator",
    "Sony",
    "Southern",
    "Spanish-speaking",
    "THz",
    "Tamil digits",
    "Telugu digits",
    "Turkic",
    "Twitter",
    "UAV",
    "USB",
    "USD",
    "USSF",
    "Unicode",
    "Uranus",
    "Urdu digits",
    "Valais",
    "Volkswagen",
    "X-Files",
    "WC",
    "WW2",
    "Wallis",
    "Web",
    "Wi-Fi",
    "Windows",
    "World",
    "XML Paper Specification",
    "abbreviation",
    "abdicate",
    "abdication",
    "abhor",
    "abhorrence",
    "abnormality",
    "abiotic",
    "aboriginals",
    "aborted",
    "abouts",
    "abrasive",
    "abridging",
    "abscess",
    "absorbent",
    "abstinent",
    "abuser",
    "acanthesthesia",
    "accusatorial",
    "acetous",
    "acetylcarnitine",
    "acetylsalicylic",
    "acidic",
    "acne",
    "acorn",
    "acquiescent",
    "acrimonious",
    "acrimony",
    "acromegaly",
    "activist",
    "acyclic",
    "acyl",
    "addict",
    "addend",
    "adicity",
    "admonish",
    "adornment",
    "adpositions",
    "adulterer",
    "adulterous",
    "aeroplane",
    "affectedly",
    "affixes",
    "affordable",
    "afterthought",
    "agnathia",
    "agoraphobia",
    "agression",
    "aground",
    "airbag",
    "airtight",
    "ait",
    "albumen",
    "alchemist",
    "aldehyde",
    "aldohexose",
    "alga",
    "alimentary",
    "aliphatic",
    "allele",
    "allergen",
    "allergological",
    "alleyway",
    "allotrope",
    "allude",
    "almond",
    "alms",
    "alphabets",
    "alpine",
    "ambergris",
    "ammeter",
    "amoeba",
    "amorously",
    "amphetamine",
    "amphibian",
    "amphibole",
    "amputate",
    "anachronistic",
    "anaemia",
    "anaemic",
    "anal",
    "angiosperms",
    "angiotensin",
    "angled",
    "angler",
    "angleworm",
    "anglicism",
    "angstrom",
    "anilingus",
    "annealing",
    "annexation",
    "anno",
    "annoyingly",
    "annuity",
    "annul",
    "anoint",
    "ante",
    "antechamber",
    "anteroposterior",
    "anthill",
    "anti-doping",
    "anti-streptolysin",
    "anticlimax",
    "anticline",
    "anticlockwise",
    "antipyretic",
    "antisense",
    "antonym",
    "antonymous",
    "anus",
    "anxiogenic",
    "aortic",
    "apatite",
    "aphaeretic",
    "aphorisms",
    "apollonian",
    "apologue",
    "apostrophe",
    "applique",
    "appendage",
    "appendectomy",
    "appendicitis",
    "appentice",
    "appetising",
    "apprentice",
    "approvable",
    "aquarium",
    "aquatic",
    "arachnid",
    "archer",
    "argipalla",
    "arity",
    "armour",
    "armoured",
    "aromantic",
    "arse",
    "arsenolite",
    "artifact",
    "artwork",
    "asbestiform",
    "aspirate",
    "asscheek",
    "assuaging",
    "astrological",
    "atrium",
    "audiovisual",
    "averring",
    "avoirdupois",
    "babble",
    "backup",
    "bagpiper",
    "ballcourt",
    "ballgame",
    "ballpoint",
    "bamboo",
    "banality",
    "banknote",
    "barb",
    "barefaced",
    "barrister",
    "barter",
    "basset",
    "bathhouse",
    "batty",
    "bead",
    "beak",
    "begrudging",
    "belittle",
    "belladona",
    "benefice",
    "benzoyl",
    "bequeath",
    "berbicara",
    "bereave",
    "bereaved",
    "bestiality",
    "bianwen",
    "bidirectional",
    "bigwig",
    "bilberry",
    "birthmark",
    "blabs",
    "blackbird",
    "bladder",
    "blastula",
    "blockhead",
    "bloodworts",
    "blotches",
    "bluefin",
    "blurring",
    "bob",
    "bobbin",
    "bodyfat",
    "bogaraveo",
    "bollard",
    "bonsai",
    "bobsledding",
    "bookmaker",
    "bootleg",
    "boozy",
    "botcher",
    "bottomed",
    "boyfriend",
    "bra",
    "braid",
    "braking",
    "breakdancer",
    "breastplate",
    "breathalyzer",
    "bribery",
    "brier",
    "brimless",
    "brimming",
    "bristletail",
    "broadsword",
    "browse",
    "browser",
    "brutish",
    "bung",
    "burbot",
    "burti",
    "byte",
    "caesura",
    "caipira",
    "calcareous",
    "calculator",
    "camouflaging",
    "canal",
    "canard",
    "candensis",
    "canid",
    "cannabis",
    "canoer",
    "canoeist",
    "canton",
    "capercaillie",
    "caprice",
    "capriciously",
    "caption",
    "carbonate",
    "carbonated",
    "carex",
    "carnivoran",
    "carnivore",
    "carnivorous",
    "carpus",
    "cartilaginous",
    "cartload",
    "carucates",
    "cashier",
    "cassette",
    "cassia",
    "cassowary",
    "castellan",
    "castes",
    "castrated",
    "cataract",
    "catastrophist",
    "cation",
    "cauldron",
    "causer",
    "caustic",
    "cedar",
    "celluloid",
    "censoring",
    "centralised",
    "cerebropathy",
    "ceremonious",
    "cervical",
    "cetacean",
    "chainsaw",
    "chaste",
    "chastely",
    "chav",
    "cheeky",
    "cheerless",
    "cheetahs",
    "cheque",
    "chessman",
    "chesspiece",
    "chewable",
    "chlorofluorocarbon",
    "chopsticks",
    "chrysantemum",
    "churl",
    "cinnabar",
    "cinnamon",
    "circumcised",
    "circumvent",
    "citronella",
    "clade",
    "clamp",
    "clapper",
    "classifier",
    "cleanliness",
    "cleave",
    "clef",
    "clitoral",
    "clitoris",
    "cloister",
    "coatroom",
    "cobbled",
    "cockfighting",
    "coddle",
    "codlings",
    "codomain",
    "coenzyme",
    "cogwheel",
    "cohabit",
    "coinage",
    "collectivisation",
    "collide",
    "colour",
    "colourless",
    "columbium",
    "combinatorial",
    "commandery",
    "commemoration",
    "common linnet et al",
    "compasses",
    "complainer",
    "comprehensible",
    "conceit",
    "concha",
    "concubine",
    "condiment",
    "condom",
    "conductance",
    "confection",
    "conformable",
    "conforming",
    "congeal",
    "congealable",
    "congee",
    "conical",
    "conjuring",
    "connector",
    "consession",
    "console",
    "constable",
    "constellation",
    "contaminant",
    "contemn",
    "contort",
    "contractions",
    "coolie",
    "copula",
    "copular",
    "copulate",
    "copulation",
    "cornel",
    "cornucopiodes",
    "corvid",
    "cosmogony",
    "costermonger",
    "councillor",
    "counsellor",
    "countably",
    "counterintuitive",
    "countrified",
    "courier",
    "cowpat",
    "cowshed",
    "crabby",
    "cracker",
    "cranberry",
    "crayon",
    "creatine",
    "creatinine",
    "creditor",
    "cremation",
    "creole",
    "crewed",
    "cribbage",
    "cricketer",
    "cringe",
    "criticise",
    "croissant",
    "croquet",
    "crossbar",
    "crossbow",
    "crossword",
    "crosswords",
    "crumb",
    "crustacean",
    "crustaceans",
    "crybaby",
    "cuckoldry",
    "cuckoo",
    "cucumber",
    "cuirass",
    "cultivar",
    "culvert",
    "cum",
    "cursive",
    "curvaceous",
    "custard",
    "cutie",
    "cuttlefish",
    "cutlery",
    "cybernetics",
    "cycling",
    "cyclone",
    "cypro",
    "cytopharynx",
    "dab",
    "daimyo",
    "daresay",
    "darken",
    "dart",
    "dawdle",
    "daydream",
    "deaconship",
    "debased",
    "debit",
    "decaffeinated",
    "decapod",
    "deceitfulness",
    "decipher",
    "deciphered",
    "decoction",
    "defamatory",
    "defame",
    "defecation",
    "defile",
    "definiteness",
    "degenerate",
    "dehusking",
    "deifying",
    "deity",
    "dejected",
    "deleted",
    "deltoidal",
    "dementia",
    "demo",
    "demolish",
    "demonym",
    "denim",
    "denture",
    "deponent",
    "depressed",
    "derisorily",
    "designator",
    "desorption",
    "despicable",
    "detent",
    "dexterous",
    "diacritics",
    "diaeresis",
    "diaper",
    "dictionaries",
    "digressing",
    "digust",
    "dike",
    "dimness",
    "diplomatique",
    "dipterous",
    "disadvantageous",
    "disallow",
    "disavow",
    "discoloured",
    "disconnect",
    "disconnection",
    "discrepant",
    "disembark",
    "dishonour",
    "dispensable",
    "dispirited",
    "displeasing",
    "disputatively",
    "disrespectful",
    "diss",
    # These two were previously fused into the single, unmatchable string
    # "dissipatedisyllabicity" (apparently a lost comma).
    "dissipate",
    "disyllabicity",
    "distaff",
    "disulfide",
    "doer",
    "dogfight",
    "dogfish",
    "domesticated",
    "doorhandle",
    "doorpost",
    "dorsal",
    "dotard",
    "doughnut",
    "download",
    "downmarket",
    "doyen",
    "dreadlock",
    "dreadlocks",
    "dredge",
    "duckling",
    "dude",
    "dull-witted",
    "dunce",
    "dupe",
    "duplicating",
    "duplicity",
    "dye",
    "dyes",
    "dyestuff",
    "eater",
    "eavesdrop",
    "echinoderms",
    "eclectic",
    "ecosystem",
    "ecstacy",
    "ectoderm",
    "effervescing",
    "egregious",
    "eigenvector",
    "ejaculate",
    "ejaculation",
    "electromechanical",
    "electroplate",
    "elephantiasis",
    "em dash",
    "emaciation",
    "email",
    "emoticon",
    "encasing",
    "encephalomyelitis",
    "enclitic",
    "enclose",
    "endocrine",
    "enforcer",
    "engrave",
    "engross",
    "enliven",
    "enquire",
    "entangle",
    "entangled",
    "entice",
    "entitlement",
    "entrails",
    "entrenchment",
    "enumerate",
    "enumerating",
    "envelops",
    "epichoric",
    "epilepsy",
    "epistle",
    "equinox",
    "esophagus",
    "espresso",
    "estrange",
    "etc",
    "etching",
    "ethane",
    "ethnicity",
    "ethology",
    "ethylene",
    "euro",
    "euthanize",
    "evergreen",
    "exaction",
    "exam",
    "exclesior",
    "excommunication",
    "excrement",
    "excrete",
    "excretement",
    "exhale",
    "exhort",
    "exine",
    "explainable",
    "expletive",
    "extortion",
    "extravagantly",
    "extraverted",
    "eyelet",
    "factious",
    "faeces",
    "faggot",
    "fairground",
    "falsely",
    "fandom",
    "fanfiction",
    "fart",
    "farthing",
    "fastener",
    "feces",
    "feigns",
    "feline",
    "felines",
    "fellatio",
    "fellator",
    "feminin",
    "fend",
    "feng",
    "feng shui",
    "fengshui",
    "feral",
    "fester",
    "fetter",
    "fewness",
    "fiancé",
    "fiancée",
    "fibre",
    "figuratively",
    "filches",
    "filching",
    "fillet",
    "fillets",
    "filterer",
    "filtration",
    "finalise",
    "firearm",
    "firebreak",
    "firefighter",
    "fireside",
    "firmware",
    "fishnet",
    "fishy",
    "fissure",
    "flatbed",
    "flattish",
    "flavour",
    "flea",
    "flightless",
    "foehn",
    "fondle",
    "footprint",
    "footrest",
    "fop",
    "forcefully",
    "ford",
    "foreshow",
    "fossil",
    "fraternal",
    "fratricide",
    "fraudulent",
    "fraudulently",
    "fredag",
    "freemasonic",
    "freestyle",
    "frequentative",
    "freshwater",
    "fridge",
    "frigate",
    "frisson",
    "fritter",
    "frontflip",
    "frontotemporal",
    "frugal",
    "fulfilment",
    "fumigating",
    "functionality",
    "fundoshi",
    "furry",
    "furthest",
    "gadoid",
    "gameplay",
    "gamling",
    "gastropod",
    "gatepost",
    "gelatinous",
    "gemstone",
    "genderqueer",
    "genealogy",
    "generative",
    "generic",
    "generically",
    "genericized",
    "genital",
    "genitalia",
    "genitals",
    "genitourinary",
    "genus",
    "geometrid",
    "getter",
    "ghostwriter",
    "giga-",
    "giraffe",
    "girder",
    "girlfriend",
    "ginseng",
    "gizzard",
    "glans",
    "glassworks",
    "glowworm",
    "glutton",
    "glycoside",
    "goalkeeper",
    "goalpost",
    "gobble",
    "goby-like",
    "god-given",
    "goddesses",
    "gonad",
    "goodwill",
    "gorged",
    "gouge",
    "graceless",
    "grafting",
    "grandchild",
    "gratuity",
    "gravedigger",
    "grebe",
    "grid",
    "grouch",
    "groupers",
    "grouse",
    "guarantor",
    "guilder",
    "guillotine",
    "guitarfish",
    "guillemets",
    "habitation",
    "habitational",
    "hagberry",
    "hairstyle",
    "hamster",
    "handball",
    "harbinger",
    "hardcore",
    "harmonize",
    "harvester",
    "harvesters",
    "hashish",
    "hassock",
    "hatefully",
    "hawksbill",
    "hawthorn",
    "hayfield",
    "hazarded",
    "headlight",
    "headlong",
    "heaths",
    "hemp",
    "heraldic",
    "heraldry",
    "herbal",
    "heterosexual",
    "hi",
    "hieroglyphs",
    "hilted",
    "hip-hop",
    "hircinous",
    "hives",
    "hoarfrost",
    "hoariness",
    "hoe",
    "holiness",
    "holly",
    "homeless",
    "homie",
    "homosexuality",
    "honorific",
    "hornet",
    "horny",
    "horseshoe",
    "horticultural",
    "hostel",
    "houseboat",
    "howin",
    "hulled",
    "humiliate",
    "humour",
    "hump",
    "husked",
    "hydroxylase",
    "hyperactivity",
    "hyperlink",
    "hypersensitivity",
    "hypersonic",
    "hyphen",
    "ichthyological",
    "icon",
    "icositetrahedron",
    "ignoble",
    "ikebana",
    "illicitly",
    "illiteracy",
    "imaginable",
    "immaturely",
    "immerse",
    "immune",
    "impermeable",
    "impiously",
    "impregnate",
    "imprison",
    "impure",
    "in-law",
    "inappropriately",
    "incredulousness",
    "incriminate",
    "indefinably",
    "indentation",
    "indistinguishably",
    "ineptitude",
    "infatuated",
    "inflectional",
    "informer",
    "infraclass",
    "infrakingdom",
    "infraorder",
    "infraphylum",
    "ingesting",
    "inhabitant",
    "inhabiting",
    "inhale",
    "injure",
    "inlaying",
    "innapropriate",
    "inoffensive",
    "inoperable",
    "inoperative",
    "inscribe",
    "insinuate",
    "inspan",
    "instrumentalist",
    "intenseness",
    "intoxication",
    "intoxification",
    "inventiveness",
    "irascible",
    "irritate",
    "islamic",
    "islet",
    "isotope",
    "jack",
    "javelin",
    "jellyfish",
    "jerkily",
    "jokingly",
    "junket",
    "kaf",
    "kangaroo",
    "kanji",
    "katydid",
    "kayak",
    "kestrel",
    "ketamine",
    "kidskin",
    "killjoy",
    "kilo-",
    "kilt",
    "kinase",
    "kingfisher",
    "kitsch",
    "kiwi",
    "knighthood",
    "kookaburra",
    "kowtow",
    "kroepoek",
    "kung fu",
    "labial",
    "labour",
    "lair",
    "lamprey",
    "lampshade",
    "landmass",
    "landmasses",
    "laptop",
    "larch",
    "larva",
    "lascivious",
    "latte",
    "lattice",
    "laughable",
    "leafless",
    "lecherous",
    "leech",
    "leek",
    "leftover",
    "legless",
    "lemming",
    "leniusculus",
    "leotard",
    "lesbian",
    "lettuce",
    "lexeme",
    "lichen",
    "lifespan",
    "ligature",
    "lighthouse",
    "lily",
    "litre",
    "little sis",
    "lizard",
    "loanword",
    "loggerhead",
    "loiter",
    "longline",
    "loofah",
    "lottery",
    "lowercase",
    "ludifica",
    "luxuriant",
    "lye",
    "madder",
    "mafia",
    "magnanimous",
    "magnetite",
    "magnorder",
    "manageable",
    "mangoes",
    "manna",
    "manoeuvre",
    "manroot",
    "maqaf",
    "marmot",
    "marsh",
    "marshy",
    "marsupial",
    "masturbate",
    "masturbates",
    "masturbating",
    "masturbation",
    "masturbator",
    "materialise",
    "matra",
    "mayfly",
    "mead",
    "meagre",
    "mediates",
    "mediator",
    "mega-",
    "megalitre",
    "melanin",
    "meningitis",
    "menorah",
    "menstrual",
    "meow",
    "mercenaria",
    "mercenary",
    "meridiem",
    "mesmerism",
    "metalworks",
    "metamphetamine",
    "methamphetamine",
    "methane",
    "metric",
    "microcomputer",
    "microprocessor",
    "midbrain",
    "milkman",
    "millet",
    "millstone",
    "minifig",
    "minifigure",
    "minting",
    "minuscules",
    "mire",
    "misbehave",
    "miscarriage",
    "miserly",
    "mislead",
    "misspelling",
    "misspelt",
    "mite",
    "mitral stenosis",
    "modem",
    "module",
    "modulus",
    "mollusc",
    "mollusk",
    "mongrel",
    "monogram",
    "monopolizing",
    "monosemy",
    "monosilane",
    "monotheistic",
    "moonshine",
    "moralization",
    "morel",
    "motorcycle",
    "motorsport",
    "motorsports",
    "moult",
    "mourner",
    "mouselike",
    "mouthpart",
    "mow",
    "muddle",
    "mugwort",
    "mulberry",
    "multiplier",
    "muntjac",
    "mutation",
    "myalgic",
    "mythical",
    "nags",
    "nape",
    "narrate",
    "naturopathic",
    "naughtily",
    "nave",
    "neighbour",
    "nerd",
    "nescio",
    "networking",
    "neume",
    "neurotransmitter",
    "newsflash",
    "nictinic",
    "nightjar",
    "nimble",
    "ninjutsu",
    "niobium",
    "nipple",
    "nitric",
    "nitrite",
    "noh",
    "noice",
    "nomen",
    "non-Roma",
    "nonchalance",
    "nonessential",
    "nonfatal",
    "nonstandard",
    "nontrivial",
    "nonzero",
    "noodles",
    "normality",
    "nosocomial",
    "notionally",
    "nucleon",
    "numeral",
    "numeric",
    "nuqta",
    "oar",
    "oars",
    "obese",
    "oblast",
    "obligatory",
    "obnoxiously",
    "obtuse",
    "octahedral",
    "octave",
    "odour",
    "oligonucleotide",
    "om",
    "omnivorous",
    "onerous",
    "online",
    "oppress",
    "ore",
    "organinc",
    "organisation",
    "oscillate",
    "osier",
    "osmanthus",
    "ostmanthus",
    "otolaryngology",
    "ouch",
    "outergarment",
    "outtake",
    "ouzel",
    "overseeing",
    "overshoe",
    "overstate",
    "overstep",
    "overused",
    "ovum",
    "oxgang",
    "paddle",
    "paenungulates",
    "palatalized",
    "palmistry",
    "paltry",
    "pancake",
    "pancakes",
    "pantherine",
    "papules",
    "paralysed",
    "paraphrasis",
    "parenthetical",
    "parere",
    "parietal",
    "paronomasia",
    "participle",
    "parvorder",
    "pasta",
    "pastern",
    "patchy",
    "paternal",
    "patty",
    "pawl",
    "pawpaw",
    "pedant",
    "pediment",
    "peevish",
    "peloton",
    "pelt",
    "penetrable",
    "penguin",
    "penile",
    "penis",
    "penitent",
    "pentatonic",
    "perceivable",
    "perceptiveness",
    "perfluorooctanoic",
    "perineum",
    "perjurer",
    "peroxidase",
    "perspire",
    "pervert",
    "pessimist",
    "petal",
    "petrel",
    "petrol",
    "pewter",
    "phenylalanine",
    "phobia",
    "phoneme",
    "photocopier",
    "photocopy",
    "photosynthetic",
    "phthisic",
    "phthisical",
    "phylogenetics",
    "phylum",
    "pickpocket",
    "piddle",
    "piecework",
    "pierce",
    "pigmentation",
    "pilfered",
    "pinecone",
    "pinewood",
    "pistil",
    "pixelization",
    "placable",
    "placeholder",
    "placenta",
    "plantlike",
    "playlist",
    "pleasurable",
    "plectrum",
    "plinth",
    "ploughgate",
    "ploughgates",
    "plunderer",
    "plural",
    "pointy",
    "pokeweed",
    "pollute",
    "polycyclic",
    "polyglot",
    "polygon",
    "polyhedra",
    "polyhedron",
    "polyiamond",
    "polytheistic",
    "polytope",
    "polyurethane",
    "pomelo",
    "pommel",
    "pons",
    "ponyfish",
    "popcorn",
    "portend",
    "positiveness",
    "possibly",
    "posteroanterior",
    "postposition",
    "postpositional",
    "potable",
    "prawn",
    "precipitous",
    "predatory",
    "predicative",
    "prefix",
    "premeditated",
    "preservative",
    "preternatural",
    "primrose",
    "prismatic",
    "proclitic",
    "procreate",
    "profanities",
    "prolapse",
    "promiscuous",
    "pronated",
    "prong",
    "pronunciation",
    "proofreading",
    "prosthetic",
    "protector",
    "prothrombin",
    "protists",
    "proto",
    "protracting",
    "provident",
    "provider",
    "provocativeness",
    "provoking",
    "psychometrics",
    "psychopathological",
    "pubic",
    "pudding",
    "puffin",
    "purloin",
    "purr",
    "pushchair",
    "pushy",
    "pyrotechnic",
    "quad",
    "quadrilateral",
    "quahog",
    "quantifying",
    "quark",
    "queue",
    "quiche",
    "quietude",
    "quilt",
    "quiver",
    "radiotherapy",
    "ramie",
    "rapids",
    "raptors",
    "rashly",
    "raven",
    "ravenously",
    "ravine",
    "reactive",
    "readied",
    "realtime",
    "redskin",
    "redstart",
    "reed",
    "reentry",
    "reeve",
    "refinedly",
    "refiner",
    "reflexive",
    "reflexively",
    "refutation",
    "regardful",
    "regnant",
    "regressive",
    "reindeer",
    "reintegrationist",
    "reinvigorated",
    "relenting",
    "relinquishment",
    "remiss",
    "renounce",
    "reordered",
    "repairer",
    "reprimand",
    "reproductory",
    "reptile",
    "republican",
    "reset",
    "restroom",
    "retract",
    "retread",
    "reunification",
    "reusable",
    "reveler",
    "revengefully",
    "rhetorical",
    "rhinarium",
    "rhombus",
    "rhotic",
    "rhubarb",
    "ribavirin",
    "riffraff",
    "ripen",
    "riverbed",
    "roasting",
    "rockhopper",
    "roe",
    "roman",
    "romanisation",
    "romanization",
    "rook",
    "roundel",
    "rout",
    "rudiments",
    "rugby",
    "rumination",
    "rummage",
    "saman",
    "samurai",
    "sandbank",
    "satirize",
    "saucer",
    "sautéed",
    "saveloy",
    "savoury",
    "sawfly",
    "sawhorse",
    "scabby",
    "scabs",
    "scaleless",
    "scampi",
    "scarecrow",
    "schoolbag",
    "scoff",
    "scoffs",
    "scold",
    "scraper",
    "screwdriver",
    "scribal",
    "scroll",
    "scrotum",
    "scuba",
    "scurf",
    "scythe",
    "seabird",
    "seaduck",
    "seagull",
    "seaplane",
    "seaport",
    "seemly",
    "seer",
    "selfishly",
    "selfsame",
    "semen",
    "semiconductor",
    "semimetal",
    "semipermeable",
    "senso",
    "sentimental",
    "separator",
    "sepulchring",
    "sequentially",
    "shamelessly",
    "shamisen",
    "shaojiu",
    "shark",
    "sheepfold",
    "shifter",
    "shindig",
    "shitting",
    "shoal",
    "shoemaker",
    "shoemaking",
    "shoeshine",
    "shuffleboard",
    "shuttlecock",
    "sibling",
    "siblings",
    "sickbed",
    "sideband",
    "sidespin",
    "silkworm",
    "silt",
    "silverfish",
    "skateboard",
    "skein",
    "skerry",
    "skier",
    "sled",
    "sleeved",
    "sleeveless",
    "sloth",
    "slut",
    "slutty",
    "smegma",
    "sob",
    "sodomite",
    "software",
    "solfège",
    "solicitation",
    "sorcerer",
    "sorceress",
    "sororal",
    "spaceflight",
    "spacetime",
    "spadix",
    "spar",
    "sparingly",
    "sparrow",
    "spasmodic",
    "specesi",
    "speciality",
    "spellings",
    "sperm",
    "spiderweb",
    "spirally",
    "spiro",
    "spiteful",
    "spitefully",
    "splint",
    "spool",
    "spore",
    "spotnape",
    # Commonly used abbreviation "spp." (several species of a genus) that
    # shows up in species names.
    "spp",
    "sprinkles",
    "sprite",
    "spritsail",
    "spruiks",
    "squander",
    "squeegee",
    "squid",
    "squint",
    "stabbing",
    "stalk",
    "stamen",
    "standalone",
    "starthistle",
    "steadfast",
    "steadfastness",
    "stealthy",
    "stenosis",
    "sth",
    "sthg",
    "stich",
    "sticker",
    "stinginess",
    "stinks",
    "stockaded",
    "stomachache",
    "stonechat",
    "storey",
    "stork",
    "stowaway",
    "straightness",
    "stricto",
    "strident",
    "stupefy",
    "subalgebra",
    "subbranch",
    "subclass",
    "subfamily",
    "subgenre",
    "subgenus",
    "subgroup",
    "subkingdom",
    "sublimely",
    "submatrix",
    "submerge",
    "suborder",
    "subphylum",
    "subscriber",
    "subsesquiplicate",
    "subset",
    "subsets",
    "subsonic",
    "substance",
    "subtribe",
    "succinctness",
    "sudoku",
    "sulk",
    "sumo",
    "sundial",
    "sunflower",
    "sunglasses",
    "sunshade",
    "sunshower",
    "superannuated",
    "supercharger",
    "superclass",
    "supercluster",
    "superdivision",
    "superdivisions",
    "superfamily",
    "superkingdom",
    "superorder",
    "superphylum",
    "supersede",
    "superunit",
    "surpassingly",
    "sustainer",
    "sutra",
    "swag",
    "swearword",
    "sweetener",
    "sweetening",
    "swimmer",
    "swimwear",
    "swindle",
    "swindler",
    "swoon",
    "swordfish",
    "symbiotic",
    "synaeresis",
    "syncope",
    "syperphylum",
    "systematics",
    "tableware",
    "tadpole",
    "tailcoat",
    "tallness",
    "tampon",
    "tanker",
    "tare",
    "tartrazine",
    "tastelessly",
    "tattle",
    "tattletale",
    "tattoo",
    "taxon",
    "taxonomic",
    "taxonomy",
    "tearful",
    "telecom",
    "telecommunication",
    "teller",
    "tera-",
    "tern",
    "terrene",
    "teshuva",
    "tesseract",
    "testicles",
    "tetrafluoromethane",
    "tetrafluoromonosilane",
    "tetragrams",
    "tetrahedron",
    "thorax",
    "thrombocytopenic",
    "thrombotic",
    "thunderstorm",
    "tibia",
    "tiddlywinks",
    "tieute",
    "tithe",
    "toady",
    "tofore",
    "tomography",
    "toothed",
    "topological",
    "topology",
    "torturer",
    "touchable",
    "towpath",
    "trainee",
    "tram",
    "trans",
    "transfinite",
    "transliteration",
    "transonic",
    "treachery",
    "tremulous",
    "trendy",
    "trepidation",
    "trickery",
    "triterpenoid",
    "trove",
    "trowelling",
    "truncations",
    "tsardom",
    "tuber",
    "tugboat",
    "tuna",
    "turmeric",
    "turner",
    "turnip",
    "tutelary",
    "twig",
    "twine",
    "two-up",
    "typeset",
    "typographer",
    "tyre",
    "unanswerable",
    "unassuming",
    "uncaring",
    "unchallenging",
    "unchaste",
    "uncircumcised",
    "uncivilised",
    "uncivilized",
    "uncomplicated",
    "unconventionally",
    "uncooked",
    "uncouth",
    "uncut",
    "undecided",
    "undergarment",
    "underpants",
    "understudy",
    "undulate",
    "undulation",
    "unevenly",
    "unfashionable",
    "unfasten",
    "unfavourable",
    "unfrequented",
    "ungulate",
    "unholy",
    "uninformed",
    "unintelligent",
    "unlikable",
    "unmoving",
    "unpeeled",
    "unprocessed",
    "unproven",
    "unraveling",
    "unravelled",
    "unravelling",
    "unrestrained",
    "unroll",
    "unscrupulously",
    "unsolicited",
    "unsorted",
    "unsound",
    "unspecialized",
    "unspecific",
    "untamed",
    "untried",
    "ununtrium",
    "unveiling",
    "unwell",
    "unworried",
    "uppercase",
    "urchin",
    "urinate",
    "urination",
    "usance",
    "utensil",
    "uterus",
    "vacating",
    "vacillate",
    "vandalize",
    "vane",
    "vapour",
    "var.",
    "variants",
    "verbose",
    "verlan",
    "verso",
    "vertebra",
    "vesicle",
    "vespers",
    "vibrance",
    "vibrate",
    "videotaped",
    "vim",
    "viol",
    "viper",
    "visor",
    "vitae",
    "voiceless",
    "voluptuary",
    "vomit",
    "voracious",
    "vulva",
    "wading",
    "wafer",
    "walkway",
    "wank",
    "wanker",
    "wantonly",
    "washerwoman",
    "watcher",
    "watchfulness",
    "watchman",
    "waterbirds",
    "watercraft",
    "waterlilies",
    "waw",
    "weaverbird",
    "webpage",
    "weevil",
    "wend",
    "wether",
    "whale",
    "whales",
    "whirlpool",
    "whitefish",
    "whitethorn",
    "whorl",
    "wildcard",
    "wildcat",
    "wildfire",
    "wimp",
    "windlass",
    "windpipe",
    "windscreen",
    "windward",
    "winemaking",
    "winterberry",
    "wisent",
    "womanlike",
    "woody",
    "workmate",
    "workplace",
    "worldliness",
    "worshipers",
    "worshipper",
    "wow",
    "wrasse",
    "wrench",
    "wrestler",
    "wrinkly",
    "yam",
    "yardland",
    "yarmulke",
    "youthfulness",
    "yuan",
    "zealotry",
    "zoospores",
    "zygosperm",
    "chamomile",
    "peppermint",
    "x-axis",
    "y-axis",
    "z-axis",
    "maté",
    "Wikimedia",
    "Wikipedia",
    "Wiktionary",
    "jargon",
}
# Blacklist: entries here are never treated as English words, overriding
# every other consideration (not merely set membership) -- even when they
# occur in brown.words(). Putting a word on this list generally makes it
# likely to be treated as a romanization instead.
not_english_words_1 = {
    "ANU",
    "Franc",
    "Frans",
    "Germani",
    "Germania",
    "J'habitais",
    "Kina",
    "Mal",
    "Mi",
    "Mihapjungguk",
    "al",
    "avec",
    "boo",
    "de",
    "du",
    "em",
    "lui",
    "ma",
    "mana",
    "novo",
    "pronto",
    "que",
    "extortionary",
    "democracy",
    "Mrs",
    "physical",
    "property",
    "ransomware",
    "program",
    "epiglottis",
    "laryngeal",
    "flap",
    "literally",
}
# Capitalized forms kept in their own set; they are merged into the
# exclusion set when the final vocabulary is built.
potentially_english_words = {"He", "Ye"}
# Everything excluded from the vocabulary: the blacklist plus the
# "potentially English" forms.
not_english_words = not_english_words_1.union(potentially_english_words)
# Build the set of (most) English words. Multi-word expressions whose
# components should not be included individually can also be listed in the
# source sets, space-separated.
#
# Start from the Brown corpus vocabulary, then layer our own additions on top.
english_words = set(brown.words())
english_words |= known_firsts
# XXX the second words of species names add too much garbage now that we
# accept "english" more loosely, so this source stays disabled:
#   set(x for name in known_species for x in name.split())
english_words |= additional_words
# The exclusions are removed last so that they override every source above.
english_words -= not_english_words