Coverage for src/wiktextract/extractor/en/parts_of_speech.py: 100%
7 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1# Definitions of extracted parts of speech codes and a mapping from
2# Wiktionary section titles to parts of speech.
3#
4# Copyright (c) 2018-2021 Tatu Ylonen. See file LICENSE and https://ylonen.org
6# This dictionary maps section titles in articles to parts-of-speech. There
7# is a lot of variety and misspellings, and this tries to deal with those.
8from typing import TypedDict
class POSMap(TypedDict, total=False):
    """Typed shape of one value in the section-title lookup table.

    Declared with ``total=False``, so every key is optional.
    """

    # Part-of-speech code assigned to the section title.
    pos: str
    # Diagnostic note attached to questionable titles (for example
    # misspelled or proscribed ones).
    debug: str
    # Extra tags associated with the section title.
    tags: list[str]
# Lookup table keyed by lower-case section title.  Each value gives the
# part-of-speech code ("pos") and, optionally, extra "tags" and a "debug"
# note for titles that are misspelled, pluralized, or proscribed.
# Misspelled keys (e.g. "adjectuve", "noum", "νoun") are intentional: they
# are the misspellings being mapped to the correct part of speech.
part_of_speech_map: dict[str, POSMap] = {
    "abbreviation": {
        "pos": "abbrev",
        "debug": "part-of-speech Abbreviation is proscribed",
        "tags": ["abbreviation"],
    },
    "acronym": {
        "pos": "abbrev",
        "debug": "part-of-speech Acronym is proscribed",
        "tags": ["abbreviation"],
    },
    "adjectival": {
        "pos": "adj_noun",
        "debug": "part-of-speech Adjectival is not valid",
    },
    "adjectival noun": {
        # Not listed as allowed, but common
        "pos": "adj_noun",
    },
    "adjectival verb": {
        # Not listed as allowed, but common
        "pos": "adj_verb",
    },
    "adjective": {
        "pos": "adj",
    },
    "adjectuve": {
        "pos": "adj",
        "debug": "misspelled subtitle",
    },
    "adjectives": {
        "pos": "adj",
        "debug": "usually used in singular",
    },
    "adnominal": {
        "pos": "adnominal",
    },
    "adverb": {
        "pos": "adv",
    },
    "adverbs": {
        "pos": "adv",
        "debug": "usually used in singular",
    },
    "adverbial phrase": {
        "pos": "adv_phrase",
        "debug": "part-of-speech Adverbial phrase is proscribed",
    },
    "affix": {
        "pos": "affix",
    },
    "adjective suffix": {
        "pos": "suffix",
        "debug": "part-of-speech Adjective suffix is proscribed",
    },
    "ambiposition": {
        "pos": "ambiposition",
    },
    "article": {
        "pos": "article",
    },
    "character": {
        "pos": "character",
    },
    "circumfix": {
        "pos": "circumfix",
        "tags": ["morpheme"],
    },
    "circumposition": {
        "pos": "circumpos",
    },
    "classifier": {
        "pos": "classifier",
    },
    "clipping": {
        "pos": "abbrev",
        "debug": "part-of-speech Clipping is proscribed",
        "tags": ["abbreviation"],
    },
    "clitic": {
        "pos": "suffix",
        "debug": "part-of-speech Clitic is proscribed",
        "tags": ["clitic"],
    },
    "combining form": {
        "pos": "combining_form",
        "tags": ["morpheme"],
    },
    "comparative": {
        "pos": "adj",
        "tags": ["comparative"],
    },
    "conjunction": {
        "pos": "conj",
    },
    "conjuntion": {
        "pos": "conj",
        "debug": "misspelled subtitle",
    },
    "contraction": {
        "pos": "contraction",
        "tags": ["abbreviation"],
    },
    "converb": {
        "pos": "converb",
    },
    "counter": {
        "pos": "counter",
    },
    "dependent noun": {
        "pos": "noun",
        "tags": [
            "dependent",
        ],
    },
    "definitions": {
        # This is used under chinese characters
        "pos": "character",
    },
    "determiner": {
        "pos": "det",
    },
    "diacritical mark": {
        "pos": "character",
        "tags": ["diacritic"],
    },
    "enclitic": {
        "pos": "suffix",
        "tags": ["clitic"],
    },
    "enclitic particle": {
        "pos": "suffix",
        "tags": ["clitic"],
    },
    "gerund": {
        "pos": "verb",
        "debug": "part-of-speech Gerund is proscribed",
        "tags": ["participle", "gerund"],
    },
    "han character": {
        "pos": "character",
        "tags": ["han"],
    },
    "han characters": {
        "pos": "character",
        "tags": ["han"],
        # Fixed garbled message (was "psually used in singular").
        "debug": "usually used in singular",
    },
    "hanja": {
        "pos": "character",
        "tags": ["Hanja"],
    },
    "hanzi": {
        "pos": "character",
        "tags": ["hanzi"],
    },
    "ideophone": {
        "pos": "noun",
        "tags": ["ideophone"],
    },
    "idiom": {
        "pos": "phrase",
        "tags": ["idiomatic"],
        # This is too common for now to complain about
        # "debug": "part-of-speech Idiom is proscribed",
    },
    "infix": {
        "pos": "infix",
        "tags": ["morpheme"],
    },
    "infinitive": {
        "pos": "verb",
        "debug": "part-of-speech Infinitive is proscribed",
        "tags": ["infinitive"],
    },
    "initialism": {
        "pos": "abbrev",
        "debug": "part-of-speech Initialism is proscribed",
        "tags": ["abbreviation"],
    },
    "interfix": {
        "pos": "interfix",
        "tags": ["morpheme"],
    },
    "interjection": {
        "pos": "intj",
    },
    "interrogative pronoun": {
        "pos": "pron",
        "tags": ["interrogative"],
    },
    "intransitive verb": {
        "pos": "verb",
        "debug": "part-of-speech Intransitive verb is proscribed",
        "tags": ["intransitive"],
    },
    "instransitive verb": {
        "pos": "verb",
        "tags": ["intransitive"],
        # Fixed garbled message (was "pisspelled subtitle").
        "debug": "misspelled subtitle",
    },
    "kanji": {
        "pos": "character",
        "tags": ["kanji"],
    },
    "letter": {
        "pos": "character",
        "tags": ["letter"],
    },
    "ligature": {
        "pos": "character",
        "tags": ["ligature"],
    },
    "nominal nuclear clause": {
        "pos": "clause",
        "debug": "part-of-speech Nominal nuclear clause is proscribed",
    },
    "νoun": {
        "pos": "noun",
        "debug": "misspelled subtitle",
    },
    "nouɲ": {
        "pos": "noun",
        "debug": "misspelled subtitle",
    },
    "noun": {
        "pos": "noun",
    },
    "noun form": {
        "pos": "noun",
        "debug": "part-of-speech Noun form is proscribed",
    },
    "nouns": {
        "pos": "noun",
        "debug": "usually in singular",
    },
    "noum": {
        "pos": "noun",
        "debug": "misspelled subtitle",
    },
    "number": {
        "pos": "num",
        "tags": ["number"],
    },
    "numeral": {
        "pos": "num",
    },
    "ordinal number": {
        "pos": "adj",
        "debug": "ordinal numbers should be adjectives",
        "tags": ["ordinal"],
    },
    "participle": {
        "pos": "verb",
        "tags": ["participle"],
    },
    "particle": {
        "pos": "particle",
        # XXX Many of these seem to be prefixes or suffixes
    },
    "past participle": {
        "pos": "verb",
        "tags": ["participle", "past"],
    },
    "perfect expression": {
        "pos": "verb",
    },
    "perfection expression": {
        "pos": "verb",
    },
    "perfect participle": {
        "pos": "verb",
        "tags": ["participle", "perfect"],
    },
    "personal pronoun": {
        "pos": "pron",
        "tags": ["person"],
    },
    "phrase": {
        "pos": "phrase",
    },
    "phrases": {
        "pos": "phrase",
        "debug": "usually used in singular",
    },
    "possessive determiner": {
        "pos": "det",
        "tags": ["possessive"],
    },
    "possessive pronoun": {
        "pos": "det",
        "tags": ["possessive"],
    },
    "postposition": {
        "pos": "postp",
    },
    "predicative": {
        "pos": "adj",
        "tags": ["predicative"],
    },
    "prefix": {
        "pos": "prefix",
        "tags": ["morpheme"],
    },
    "preposition": {
        "pos": "prep",
    },
    "prepositions": {
        "pos": "prep",
        "debug": "usually used in singular",
    },
    "prepositional expressions": {
        "pos": "prep",
        "debug": "part-of-speech Prepositional expressions is proscribed",
    },
    "prepositional phrase": {
        "pos": "prep_phrase",
    },
    "prepositional pronoun": {
        "pos": "pron",
        "debug": "part-of-speech Prepositional pronoun is proscribed",
        "tags": ["prepositional"],
    },
    "present participle": {
        "pos": "verb",
        "debug": "part-of-speech Present participle is proscribed",
        "tags": ["participle", "present"],
    },
    "preverb": {
        "pos": "preverb",
    },
    "pronoun": {
        "pos": "pron",
    },
    "proper noun": {
        "pos": "name",
    },
    "proper oun": {
        "pos": "name",
        "debug": "misspelled subtitle",
    },
    "proposition": {
        "pos": "prep",  # Appears to be a misspelling of preposition
        "debug": "misspelled subtitle",
    },
    "proverb": {
        "pos": "proverb",
    },
    "punctuation mark": {
        "pos": "punct",
        "tags": ["punctuation"],
    },
    "punctuation": {
        "pos": "punct",
        "debug": "part-of-speech Punctuation should be Punctuation mark",
        "tags": ["punctuation"],
    },
    "relative": {
        "pos": "conj",
        "tags": ["relative"],
    },
    "romanization": {
        "pos": "romanization",
    },
    "root": {
        "pos": "root",
        "tags": ["morpheme"],
    },
    "suffix": {
        "pos": "suffix",
        "tags": ["morpheme"],
    },
    "suffix form": {
        "pos": "suffix",
        "debug": "part-of-speech Suffix form is proscribed",
        "tags": ["morpheme"],
    },
    "syllable": {
        "pos": "syllable",
    },
    "symbol": {
        "pos": "symbol",
    },
    "transitive verb": {
        "pos": "verb",
        "tags": ["transitive"],
    },
    "verb": {
        "pos": "verb",
    },
    "verb form": {
        "pos": "verb",
        "debug": "part-of-speech Verb form is proscribed",
    },
    "verbal noun": {
        "pos": "noun",
        "tags": ["verbal"],
    },
    "verbs": {
        "pos": "verb",
        "debug": "usually in singular",
    },
}
# Import-time sanity check: whenever an entry supplies "tags", the value must
# be a list or tuple.  The assertion message names the offending section
# title so a bad entry in the table can be found immediately.
for k, v in part_of_speech_map.items():
    if "tags" in v:
        assert isinstance(v["tags"], (list, tuple)), (
            f"part_of_speech_map[{k!r}]['tags'] must be a list or tuple"
        )
# Every distinct "pos" code that occurs among the values of
# part_of_speech_map, gathered into a set.
PARTS_OF_SPEECH = {entry["pos"] for entry in part_of_speech_map.values()}