Coverage for src/wiktextract/extractor/en/parts_of_speech.py: 100%

7 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1# Definitions of extracted parts of speech codes and a mapping from 

2# Wiktionary section titles to parts of speech. 

3# 

4# Copyright (c) 2018-2021 Tatu Ylonen. See file LICENSE and https://ylonen.org 

5 

6# This dictionary maps section titles in articles to parts-of-speech. There 

7# is a lot of variety and misspellings, and this tries to deal with those. 

8from typing import TypedDict 

9 

class POSMap(TypedDict, total=False):
    """Value type for one entry of ``part_of_speech_map``.

    Declared with ``total=False``, so every key is optional as far as the
    type checker is concerned; in the table below every entry supplies
    ``"pos"`` while ``"debug"`` and ``"tags"`` appear only on some entries.
    """

    # Part-of-speech code the section title maps to (e.g. "noun", "adj").
    pos: str
    # Diagnostic message attached to proscribed, misspelled or otherwise
    # non-standard titles.
    # NOTE(review): presumably reported by the extractor when the title is
    # encountered -- confirm against the caller.
    debug: str
    # Extra tags attached to the title (e.g. ["abbreviation"]).
    tags: list[str]


# Maps a section title to its part-of-speech data.  Keys are written in
# lower case here.
# NOTE(review): presumably callers lower-case the Wiktionary title before the
# lookup -- confirm against the caller.
# Misspelled and plural variants are deliberately present as separate keys so
# that they still resolve to a part of speech.
part_of_speech_map: dict[str, POSMap] = {
    "abbreviation": {
        "pos": "abbrev",
        "debug": "part-of-speech Abbreviation is proscribed",
        "tags": ["abbreviation"],
    },
    "acronym": {
        "pos": "abbrev",
        "debug": "part-of-speech Acronym is proscribed",
        "tags": ["abbreviation"],
    },
    "adjectival": {
        "pos": "adj_noun",
        "debug": "part-of-speech Adjectival is not valid",
    },
    "adjectival noun": {
        # Not listed as allowed, but common
        "pos": "adj_noun",
    },
    "adjectival verb": {
        # Not listed as allowed, but common
        "pos": "adj_verb",
    },
    "adjective": {
        "pos": "adj",
    },
    "adjectuve": {
        "pos": "adj",
        "debug": "misspelled subtitle",
    },
    "adjectives": {
        "pos": "adj",
        "debug": "usually used in singular",
    },
    "adnominal": {
        "pos": "adnominal",
    },
    "adverb": {
        "pos": "adv",
    },
    "adverbs": {
        "pos": "adv",
        "debug": "usually used in singular",
    },
    "adverbial phrase": {
        "pos": "adv_phrase",
        "debug": "part-of-speech Adverbial phrase is proscribed",
    },
    "affix": {
        "pos": "affix",
    },
    "adjective suffix": {
        "pos": "suffix",
        "debug": "part-of-speech Adjective suffix is proscribed",
    },
    "ambiposition": {
        "pos": "ambiposition",
    },
    "article": {
        "pos": "article",
    },
    "character": {
        "pos": "character",
    },
    "circumfix": {
        "pos": "circumfix",
        "tags": ["morpheme"],
    },
    "circumposition": {
        "pos": "circumpos",
    },
    "classifier": {
        "pos": "classifier",
    },
    "clipping": {
        "pos": "abbrev",
        "debug": "part-of-speech Clipping is proscribed",
        "tags": ["abbreviation"],
    },
    "clitic": {
        "pos": "suffix",
        "debug": "part-of-speech Clitic is proscribed",
        "tags": ["clitic"],
    },
    "combining form": {
        "pos": "combining_form",
        "tags": ["morpheme"],
    },
    "comparative": {
        "pos": "adj",
        "tags": ["comparative"],
    },
    "conjunction": {
        "pos": "conj",
    },
    "conjuntion": {
        "pos": "conj",
        "debug": "misspelled subtitle",
    },
    "contraction": {
        "pos": "contraction",
        "tags": ["abbreviation"],
    },
    "converb": {
        "pos": "converb",
    },
    "counter": {
        "pos": "counter",
    },
    "dependent noun": {
        "pos": "noun",
        "tags": ["dependent"],
    },
    "definitions": {
        # This is used under chinese characters
        "pos": "character",
    },
    "determiner": {
        "pos": "det",
    },
    "diacritical mark": {
        "pos": "character",
        "tags": ["diacritic"],
    },
    "enclitic": {
        "pos": "suffix",
        "tags": ["clitic"],
    },
    "enclitic particle": {
        "pos": "suffix",
        "tags": ["clitic"],
    },
    "gerund": {
        "pos": "verb",
        "debug": "part-of-speech Gerund is proscribed",
        "tags": ["participle", "gerund"],
    },
    "han character": {
        "pos": "character",
        "tags": ["han"],
    },
    "han characters": {
        "pos": "character",
        "tags": ["han"],
        # Fixed typo ("psually") so the message matches the other plural
        # entries.
        "debug": "usually used in singular",
    },
    "hanja": {
        "pos": "character",
        "tags": ["Hanja"],
    },
    "hanzi": {
        "pos": "character",
        "tags": ["hanzi"],
    },
    "ideophone": {
        "pos": "noun",
        "tags": ["ideophone"],
    },
    "idiom": {
        "pos": "phrase",
        "tags": ["idiomatic"],
        # This is too common for now to complain about
        # "debug": "part-of-speech Idiom is proscribed",
    },
    "infix": {
        "pos": "infix",
        "tags": ["morpheme"],
    },
    "infinitive": {
        "pos": "verb",
        "debug": "part-of-speech Infinitive is proscribed",
        "tags": ["infinitive"],
    },
    "initialism": {
        "pos": "abbrev",
        "debug": "part-of-speech Initialism is proscribed",
        "tags": ["abbreviation"],
    },
    "interfix": {
        "pos": "interfix",
        "tags": ["morpheme"],
    },
    "interjection": {
        "pos": "intj",
    },
    "interrogative pronoun": {
        "pos": "pron",
        "tags": ["interrogative"],
    },
    "intransitive verb": {
        "pos": "verb",
        "debug": "part-of-speech Intransitive verb is proscribed",
        "tags": ["intransitive"],
    },
    "instransitive verb": {
        "pos": "verb",
        "tags": ["intransitive"],
        # Fixed typo ("pisspelled") so the message matches the other
        # misspelling entries.
        "debug": "misspelled subtitle",
    },
    "kanji": {
        "pos": "character",
        "tags": ["kanji"],
    },
    "letter": {
        "pos": "character",
        "tags": ["letter"],
    },
    "ligature": {
        "pos": "character",
        "tags": ["ligature"],
    },
    "nominal nuclear clause": {
        "pos": "clause",
        "debug": "part-of-speech Nominal nuclear clause is proscribed",
    },
    # The next two keys are NOT plain ASCII "noun": the first starts with a
    # Greek nu lookalike and the second ends with a hooked "n".  Keep the
    # exact characters when editing.
    "νoun": {
        "pos": "noun",
        "debug": "misspelled subtitle",
    },
    "nouɲ": {
        "pos": "noun",
        "debug": "misspelled subtitle",
    },
    "noun": {
        "pos": "noun",
    },
    "noun form": {
        "pos": "noun",
        "debug": "part-of-speech Noun form is proscribed",
    },
    "nouns": {
        "pos": "noun",
        "debug": "usually in singular",
    },
    "noum": {
        "pos": "noun",
        "debug": "misspelled subtitle",
    },
    "number": {
        "pos": "num",
        "tags": ["number"],
    },
    "numeral": {
        "pos": "num",
    },
    "ordinal number": {
        "pos": "adj",
        "debug": "ordinal numbers should be adjectives",
        "tags": ["ordinal"],
    },
    "participle": {
        "pos": "verb",
        "tags": ["participle"],
    },
    "particle": {
        "pos": "particle",
        # XXX Many of these seem to be prefixes or suffixes
    },
    "past participle": {
        "pos": "verb",
        "tags": ["participle", "past"],
    },
    "perfect expression": {
        "pos": "verb",
    },
    "perfection expression": {
        "pos": "verb",
    },
    "perfect participle": {
        "pos": "verb",
        "tags": ["participle", "perfect"],
    },
    "personal pronoun": {
        "pos": "pron",
        "tags": ["person"],
    },
    "phrase": {
        "pos": "phrase",
    },
    "phrases": {
        "pos": "phrase",
        "debug": "usually used in singular",
    },
    "possessive determiner": {
        "pos": "det",
        "tags": ["possessive"],
    },
    "possessive pronoun": {
        "pos": "det",
        "tags": ["possessive"],
    },
    "postposition": {
        "pos": "postp",
    },
    "predicative": {
        "pos": "adj",
        "tags": ["predicative"],
    },
    "prefix": {
        "pos": "prefix",
        "tags": ["morpheme"],
    },
    "preposition": {
        "pos": "prep",
    },
    "prepositions": {
        "pos": "prep",
        "debug": "usually used in singular",
    },
    "prepositional expressions": {
        "pos": "prep",
        "debug": "part-of-speech Prepositional expressions is proscribed",
    },
    "prepositional phrase": {
        "pos": "prep_phrase",
    },
    "prepositional pronoun": {
        "pos": "pron",
        "debug": "part-of-speech Prepositional pronoun is proscribed",
        "tags": ["prepositional"],
    },
    "present participle": {
        "pos": "verb",
        "debug": "part-of-speech Present participle is proscribed",
        "tags": ["participle", "present"],
    },
    "preverb": {
        "pos": "preverb",
    },
    "pronoun": {
        "pos": "pron",
    },
    "proper noun": {
        "pos": "name",
    },
    "proper oun": {
        "pos": "name",
        "debug": "misspelled subtitle",
    },
    "proposition": {
        "pos": "prep",  # Appears to be a misspelling of preposition
        "debug": "misspelled subtitle",
    },
    "proverb": {
        "pos": "proverb",
    },
    "punctuation mark": {
        "pos": "punct",
        "tags": ["punctuation"],
    },
    "punctuation": {
        "pos": "punct",
        "debug": "part-of-speech Punctuation should be Punctuation mark",
        "tags": ["punctuation"],
    },
    "relative": {
        "pos": "conj",
        "tags": ["relative"],
    },
    "romanization": {
        "pos": "romanization",
    },
    "root": {
        "pos": "root",
        "tags": ["morpheme"],
    },
    "suffix": {
        "pos": "suffix",
        "tags": ["morpheme"],
    },
    "suffix form": {
        "pos": "suffix",
        "debug": "part-of-speech Suffix form is proscribed",
        "tags": ["morpheme"],
    },
    "syllable": {
        "pos": "syllable",
    },
    "symbol": {
        "pos": "symbol",
    },
    "transitive verb": {
        "pos": "verb",
        "tags": ["transitive"],
    },
    "verb": {
        "pos": "verb",
    },
    "verb form": {
        "pos": "verb",
        "debug": "part-of-speech Verb form is proscribed",
    },
    "verbal noun": {
        "pos": "noun",
        "tags": ["verbal"],
    },
    "verbs": {
        "pos": "verb",
        "debug": "usually in singular",
    },
}

# Import-time sanity check on the table: whenever an entry supplies "tags",
# the value must be a list or tuple (a bare string would fail here).  The
# message names the offending section title so a bad entry is easy to find.
for title, entry in part_of_speech_map.items():
    if "tags" in entry:
        assert isinstance(entry["tags"], (list, tuple)), (
            f"tags for section title {title!r} must be a list or tuple"
        )

426 

# Set of all possible parts-of-speech returned by wiktionary reading.
# Built from the "pos" value of every entry in part_of_speech_map, so it
# stays in sync with the table automatically.
PARTS_OF_SPEECH: set[str] = {
    entry["pos"] for entry in part_of_speech_map.values()
}