Coverage for src/wiktextract/extractor/en/parts_of_speech.py

1# Definitions of extracted parts of speech codes and a mapping from

2# Wiktionary section titles to parts of speech.

6# This dictionary maps section titles in articles to parts-of-speech. There

7# is a lot of variety and misspellings, and this tries to deal with those.

8from typing import TypedDict

class POSMap(TypedDict, total=False):
    """Value type of ``part_of_speech_map``.

    Declared with ``total=False``, so no key is required by the type.
    """

    # Part-of-speech code the section title maps to.
    pos: str
    # Diagnostic text attached to proscribed, misspelled or otherwise
    # irregular section titles.
    debug: str
    # Extra tags associated with the section title.
    tags: list[str]


# Maps lowercase section titles to their part-of-speech code, plus optional
# tags and an optional debug message. Misspelled and plural titles are listed
# deliberately so that they still resolve to the intended code.
part_of_speech_map: dict[str, POSMap] = {
    "abbreviation": {
        "pos": "abbrev",
        "debug": "part-of-speech Abbreviation is proscribed",
        "tags": ["abbreviation"],
    },
    "acronym": {
        "pos": "abbrev",
        "debug": "part-of-speech Acronym is proscribed",
        "tags": ["abbreviation"],
    },
    "adjectival": {
        "pos": "adj_noun",
        "debug": "part-of-speech Adjectival is not valid",
    },
    "adjectival noun": {
        # Not listed as allowed, but common
        "pos": "adj_noun",
    },
    "adjectival verb": {
        # Not listed as allowed, but common
        "pos": "adj_verb",
    },
    "adjective": {
        "pos": "adj",
    },
    "adjectuve": {
        "pos": "adj",
        "debug": "misspelled subtitle",
    },
    "adjectives": {
        "pos": "adj",
        "debug": "usually used in singular",
    },
    "adnominal": {
        "pos": "adnominal",
    },
    "adverb": {
        "pos": "adv",
    },
    "adverbs": {
        "pos": "adv",
        "debug": "usually used in singular",
    },
    "adverbial phrase": {
        "pos": "adv_phrase",
        "debug": "part-of-speech Adverbial phrase is proscribed",
    },
    "affix": {
        "pos": "affix",
    },
    "adjective suffix": {
        "pos": "suffix",
        "debug": "part-of-speech Adjective suffix is proscribed",
    },
    "ambiposition": {
        "pos": "ambiposition",
    },
    "article": {
        "pos": "article",
    },
    "character": {
        "pos": "character",
    },
    "circumfix": {
        "pos": "circumfix",
        "tags": ["morpheme"],
    },
    "circumposition": {
        "pos": "circumpos",
    },
    "classifier": {
        "pos": "classifier",
    },
    "clipping": {
        "pos": "abbrev",
        "debug": "part-of-speech Clipping is proscribed",
        "tags": ["abbreviation"],
    },
    "clitic": {
        "pos": "suffix",
        "debug": "part-of-speech Clitic is proscribed",
        "tags": ["clitic"],
    },
    "combining form": {
        "pos": "combining_form",
        "tags": ["morpheme"],
    },
    "comparative": {
        "pos": "adj",
        "tags": ["comparative"],
    },
    "conjunction": {
        "pos": "conj",
    },
    "conjuntion": {
        "pos": "conj",
        "debug": "misspelled subtitle",
    },
    "contraction": {
        "pos": "contraction",
        "tags": ["abbreviation"],
    },
    "converb": {
        "pos": "converb",
    },
    "counter": {
        "pos": "counter",
    },
    "dependent noun": {
        "pos": "noun",
        "tags": [
            "dependent",
        ],
    },
    "definitions": {
        # This is used under Chinese characters
        "pos": "character",
    },
    "determiner": {
        "pos": "det",
    },
    "diacritical mark": {
        "pos": "character",
        "tags": ["diacritic"],
    },
    "enclitic": {
        "pos": "suffix",
        "tags": ["clitic"],
    },
    "enclitic particle": {
        "pos": "suffix",
        "tags": ["clitic"],
    },
    "gerund": {
        "pos": "verb",
        "debug": "part-of-speech Gerund is proscribed",
        "tags": ["participle", "gerund"],
    },
    "han character": {
        "pos": "character",
        "tags": ["han"],
    },
    "han characters": {
        "pos": "character",
        "tags": ["han"],
        "debug": "usually used in singular",
    },
    "hanja": {
        "pos": "character",
        "tags": ["Hanja"],
    },
    "hanzi": {
        "pos": "character",
        "tags": ["hanzi"],
    },
    "ideophone": {
        "pos": "noun",
        "tags": ["ideophone"],
    },
    "idiom": {
        "pos": "phrase",
        "tags": ["idiomatic"],
        # This is too common for now to complain about
        # "debug": "part-of-speech Idiom is proscribed",
    },
    "infix": {
        "pos": "infix",
        "tags": ["morpheme"],
    },
    "infinitive": {
        "pos": "verb",
        "debug": "part-of-speech Infinitive is proscribed",
        "tags": ["infinitive"],
    },
    "initialism": {
        "pos": "abbrev",
        "debug": "part-of-speech Initialism is proscribed",
        "tags": ["abbreviation"],
    },
    "interfix": {
        "pos": "interfix",
        "tags": ["morpheme"],
    },
    "interjection": {
        "pos": "intj",
    },
    "interrogative pronoun": {
        "pos": "pron",
        "tags": ["interrogative"],
    },
    "intransitive verb": {
        "pos": "verb",
        "debug": "part-of-speech Intransitive verb is proscribed",
        "tags": ["intransitive"],
    },
    "instransitive verb": {
        "pos": "verb",
        "tags": ["intransitive"],
        "debug": "misspelled subtitle",
    },
    "kanji": {
        "pos": "character",
        "tags": ["kanji"],
    },
    "letter": {
        "pos": "character",
        "tags": ["letter"],
    },
    "ligature": {
        "pos": "character",
        "tags": ["ligature"],
    },
    "nominal nuclear clause": {
        "pos": "clause",
        "debug": "part-of-speech Nominal nuclear clause is proscribed",
    },
    # NOTE: the next two keys are look-alike misspellings of "noun" that use
    # a non-Latin/IPA character (first letter of the first key, last letter
    # of the second); they are distinct from the plain "noun" key below.
    "νoun": {
        "pos": "noun",
        "debug": "misspelled subtitle",
    },
    "nouɲ": {
        "pos": "noun",
        "debug": "misspelled subtitle",
    },
    "noun": {
        "pos": "noun",
    },
    "noun form": {
        "pos": "noun",
        "debug": "part-of-speech Noun form is proscribed",
    },
    "nouns": {
        "pos": "noun",
        "debug": "usually in singular",
    },
    "noum": {
        "pos": "noun",
        "debug": "misspelled subtitle",
    },
    "number": {
        "pos": "num",
        "tags": ["number"],
    },
    "numeral": {
        "pos": "num",
    },
    "ordinal number": {
        "pos": "adj",
        "debug": "ordinal numbers should be adjectives",
        "tags": ["ordinal"],
    },
    "participle": {
        "pos": "verb",
        "tags": ["participle"],
    },
    "particle": {
        "pos": "particle",
        # XXX Many of these seem to be prefixes or suffixes
    },
    "past participle": {
        "pos": "verb",
        "tags": ["participle", "past"],
    },
    "perfect expression": {
        "pos": "verb",
    },
    "perfection expression": {
        "pos": "verb",
    },
    "perfect participle": {
        "pos": "verb",
        "tags": ["participle", "perfect"],
    },
    "personal pronoun": {
        "pos": "pron",
        "tags": ["person"],
    },
    "phrase": {
        "pos": "phrase",
    },
    "phrases": {
        "pos": "phrase",
        "debug": "usually used in singular",
    },
    "possessive determiner": {
        "pos": "det",
        "tags": ["possessive"],
    },
    "possessive pronoun": {
        "pos": "det",
        "tags": ["possessive"],
    },
    "postposition": {
        "pos": "postp",
    },
    "predicative": {
        "pos": "adj",
        "tags": ["predicative"],
    },
    "prefix": {
        "pos": "prefix",
        "tags": ["morpheme"],
    },
    "preposition": {
        "pos": "prep",
    },
    "prepositions": {
        "pos": "prep",
        "debug": "usually used in singular",
    },
    "prepositional expressions": {
        "pos": "prep",
        "debug": "part-of-speech Prepositional expressions is proscribed",
    },
    "prepositional phrase": {
        "pos": "prep_phrase",
    },
    "prepositional pronoun": {
        "pos": "pron",
        "debug": "part-of-speech Prepositional pronoun is proscribed",
        "tags": ["prepositional"],
    },
    "present participle": {
        "pos": "verb",
        "debug": "part-of-speech Present participle is proscribed",
        "tags": ["participle", "present"],
    },
    "preverb": {
        "pos": "preverb",
    },
    "pronoun": {
        "pos": "pron",
    },
    "proper noun": {
        "pos": "name",
    },
    "proper oun": {
        "pos": "name",
        "debug": "misspelled subtitle",
    },
    "proposition": {
        "pos": "prep",  # Appears to be a misspelling of preposition
        "debug": "misspelled subtitle",
    },
    "proverb": {
        "pos": "proverb",
    },
    "punctuation mark": {
        "pos": "punct",
        "tags": ["punctuation"],
    },
    "punctuation": {
        "pos": "punct",
        "debug": "part-of-speech Punctuation should be Punctuation mark",
        "tags": ["punctuation"],
    },
    "relative": {
        "pos": "conj",
        "tags": ["relative"],
    },
    "romanization": {
        "pos": "romanization",
    },
    "root": {
        "pos": "root",
        "tags": ["morpheme"],
    },
    "suffix": {
        "pos": "suffix",
        "tags": ["morpheme"],
    },
    "suffix form": {
        "pos": "suffix",
        "debug": "part-of-speech Suffix form is proscribed",
        "tags": ["morpheme"],
    },
    "syllable": {
        "pos": "syllable",
    },
    "symbol": {
        "pos": "symbol",
    },
    "transitive verb": {
        "pos": "verb",
        "tags": ["transitive"],
    },
    "verb": {
        "pos": "verb",
    },
    "verb form": {
        "pos": "verb",
        "debug": "part-of-speech Verb form is proscribed",
    },
    "verbal noun": {
        "pos": "noun",
        "tags": ["verbal"],
    },
    "verbs": {
        "pos": "verb",
        "debug": "usually in singular",
    },
}

# Import-time sanity check of the table: wherever an entry carries "tags",
# the value must be a list or tuple. A bare string is neither, so it is
# rejected here. The message names the offending section title so a failure
# can be traced to a specific entry.
for k, v in part_of_speech_map.items():
    if "tags" in v:
        assert isinstance(v["tags"], (list, tuple)), (
            f"part_of_speech_map[{k!r}]['tags'] must be a list or tuple, "
            f"got {type(v['tags']).__name__}"
        )

426

# Every distinct "pos" code that appears as a value in part_of_speech_map,
# i.e. the full set of part-of-speech codes this table can yield.
PARTS_OF_SPEECH = {entry["pos"] for entry in part_of_speech_map.values()}

Coverage for src/wiktextract/extractor/en/parts_of_speech.py: 100%

7 statements