Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 82%

41 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-18 10:14 +0000

1# Language-specific configuration for various aspects of inflection table 

2# parsing. 

3 

4import re 

5from typing import Optional, TypedDict, Union 

6 

7from ...tags import valid_tags 

8from .parts_of_speech import PARTS_OF_SPEECH 

9 

10LangConfDict = TypedDict( 

11 "LangConfDict", 

12 { 

13 "next": str, 

14 "hdr_expand_first": set[str], 

15 "hdr_expand_cont": set[str], 

16 "animate_inanimate_remove": bool, 

17 "both_active_passive_remove": bool, 

18 "both_strong_weak_remove": bool, 

19 "definitenesses": list[str], 

20 "empty_row_resets": bool, 

21 "form_transformations": list[ 

22 list[tuple[str, ...] | str] 

23 ], # tag extraction, lang_specific_tags() 

24 "genders": Optional[list[str]], 

25 "imperative_no_tense": bool, 

26 "masc_only_animate": bool, # Slavic special 

27 "numbers": list[str], 

28 "persons": list[str], 

29 "pl_virile_nonvirile": bool, 

30 "reuse_cellspan": str, # stop/skip/reuse 

31 "skip_mood_mood": bool, 

32 "skip_tense_tense": bool, 

33 "stop_non_finite_non_finite": bool, 

34 "stop_non_finite_voice": bool, 

35 "stop_non_finite_tense": bool, 

36 "strengths": list[str], 

37 "virile_nonvirile_remove": bool, 

38 "voices": list[str], 

39 "special_phrase_splits": dict[ 

40 str, list[Union[list[str], str]] 

41 ], # value: (split phrase, tags) 

42 "form_replacements": dict[ 

43 str, Union[str, list[str]] 

44 ], # value: [replacement, tags] 

45 # Greek-style bracket semantics 

46 "parentheses_for_informal": bool, 

47 "square_brackets_for_rare": bool, 

48 "curly_brackets_for_archaic": bool, 

49 # Armenian; migrated old data here 

50 "lang_tag_mappings": Optional[ 

51 dict[str, dict[tuple[str, ...], list[str]]] 

52 ], 

53 # Spanish has a lot of "vos" and "tú" in its tables that look like 

54 # references, and they give their form certain tags. 

55 # Dict of references ("vos") that point to tag strings "first-person 

56 # singular" that *extend* tags. 

57 "special_references": Optional[dict[str, str]], 

58 # Some languages like Icelandic and Faroese have text cells in the 

59 # upper left that we'd like to ignore. 

60 "ignore_top_left_text_cell": bool, 

61 # Minor regex replacements for cleanup in parse_simple_table() 

62 "minor_text_cleanups": Optional[ 

63 dict[str, str] 

64 ], # dict of {regex: substitution} 

65 "articles_in_separate_columns": bool, 

66 # Cells to ignore in this language, unless the cell has the key 

67 # as a tag. 

68 "conditionally_ignored_cells": dict[str, list[str]], 

69 }, 

70 total=False, 

71) 

72 

# Per-language (and per-language-group) overrides for inflection table
# parsing.  Keys are either language names or lower-case "-group" names.
# An entry only lists the fields it overrides; get_lang_conf() resolves a
# missing field by following the entry's "next" link and finally falling
# back to the "default" entry, which defines every field except "next".
#
# Each "form_transformations" rule is
#     [part(s) of speech, regex, replacement, space-separated tags]
# and is consumed by lang_specific_tags().
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": {
            "number",
            "mood",
            "referent",
            "aspect",
            "tense",
            "voice",
            "non-finite",
            "case",
            "possession",
        },
        "hdr_expand_cont": {
            "person",
            "gender",
            "number",
            "degree",
            "polarity",
            "voice",
            "misc",
        },
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
    },
    "austronesian-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "bantu-group": {
        "genders": None,
    },
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {
        "next": "uralic-group",
    },
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "german-group": {  # languages closely related to or offshoot from German
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    "germanic-group": {  # Germanic languages as a whole
        "next": "indo-european-group",
    },
    "Akkadian": {
        "next": "semitic-group",
    },
    "Alemannic German": {
        "next": "German",
    },
    "Amharic": {
        "next": "semitic-group",
    },
    "Ancient Greek": {
        "next": "Proto-Indo-European",  # Has dual
        "form_transformations": [
            # Used to remove the gendered article alternatives at the start
            # of table entries like ἰχθυοκένταυρος / Ancient Greek
            [("noun", "name"), "^ὁ, ἡ ", "", ""],
            [("noun", "name"), "^τὼ ", "", ""],
            [("noun", "name"), "^οἱ, αἱ ", "", ""],
            [("noun", "name"), "^τοῦ, τῆς ", "", ""],
            [("noun", "name"), "^τοῖν ", "", ""],
            [("noun", "name"), "^τῶν ", "", ""],
            [("noun", "name"), "^τῷ, τῇ ", "", ""],
            [("noun", "name"), "^τοῖς, ταῖς ", "", ""],
            [("noun", "name"), "^τὸν, τὴν ", "", ""],
            [("noun", "name"), "^τὼ ", "", ""],
            [("noun", "name"), "^τοὺς, τᾱ̀ς ", "", ""],
            [("noun", "name"), "(?m)^ho, hē ", "", ""],
            [("noun", "name"), "(?m)^tṑ ", "", ""],
            [("noun", "name"), "(?m)^hoi, hai ", "", ""],
            [("noun", "name"), "(?m)^toû, tês ", "", ""],
            [("noun", "name"), "(?m)^toîn ", "", ""],
            [("noun", "name"), "(?m)^tôn ", "", ""],
            [("noun", "name"), "(?m)^tôi, têi ", "", ""],
            [("noun", "name"), "(?m)^toîs, taîs ", "", ""],
            [("noun", "name"), "(?m)^tòn, tḕn ", "", ""],
            [("noun", "name"), "(?m)^tṑ ", "", ""],
            [("noun", "name"), "(?m)^toùs, tā̀s ", "", ""],
            # New added ones, leaving the old ones just in case
            [("noun", "name"), r"^ὁ ", "", ""],
            [("noun", "name"), r"(?m)^ho ", "", ""],
            [("noun", "name"), r"^ἡ ", "", ""],
            [("noun", "name"), r"(?m)^hē ", "", ""],
            [("noun", "name"), r"^αἱ ", "", ""],
            [("noun", "name"), r"(?m)^hai ", "", ""],
            [("noun", "name"), r"^τῆς ", "", ""],
            [("noun", "name"), r"(?m)^tês ", "", ""],
            [("noun", "name"), r"^τῇ ", "", ""],
            [("noun", "name"), r"(?m)^tēî ", "", ""],
            [("noun", "name"), r"(?m)^têi ", "", ""],
            [("noun", "name"), r"^ταῖς ", "", ""],
            [("noun", "name"), r"(?m)^taîs ", "", ""],
            [("noun", "name"), r"^τὴν ", "", ""],
            [("noun", "name"), r"(?m)^tḕn ", "", ""],
            [("noun", "name"), r"^τᾱ̀ς ", "", ""],
            [("noun", "name"), r"(?m)^tā̀s ", "", ""],
            [("noun", "name"), r"^ὁ / ἡ ", "", ""],
            [("noun", "name"), r"(?m)^ho / hē ", "", ""],
            [("noun", "name"), r"^οἱ / αἱ ", "", ""],
            [("noun", "name"), r"(?m)^hoi / hai ", "", ""],
            [("noun", "name"), r"^τοῦ / τῆς ", "", ""],
            [("noun", "name"), r"(?m)^toû / tês ", "", ""],
            [("noun", "name"), r"^τῷ / τῇ ", "", ""],
            [("noun", "name"), r"(?m)^tōî / tēî ", "", ""],
            [("noun", "name"), r"^τοῖς / ταῖς ", "", ""],
            [("noun", "name"), r"(?m)^toîs / taîs ", "", ""],
            [("noun", "name"), r"^τὸν / τὴν ", "", ""],
            [("noun", "name"), r"(?m)^tòn / tḕn ", "", ""],
            [("noun", "name"), r"^τοὺς / τᾱ̀ς ", "", ""],
            [("noun", "name"), r"(?m)^toùs / tā̀s ", "", ""],
            [("noun", "name"), r"^οἱ ", "", ""],
            [("noun", "name"), r"(?m)^hoi ", "", ""],
            [("noun", "name"), r"^τοῦ ", "", ""],
            [("noun", "name"), r"(?m)^toû ", "", ""],
            [("noun", "name"), r"^τῷ ", "", ""],
            [("noun", "name"), r"(?m)^tôi ", "", ""],
            [("noun", "name"), r"^τοῖς ", "", ""],
            [("noun", "name"), r"(?m)^toîs ", "", ""],
            [("noun", "name"), r"^τὸν ", "", ""],
            [("noun", "name"), r"(?m)^tòn ", "", ""],
            [("noun", "name"), r"^τοὺς ", "", ""],
            [("noun", "name"), r"(?m)^toùs ", "", ""],
            [("noun", "name"), r"^τὸ ", "", ""],
            [("noun", "name"), r"(?m)^tò ", "", ""],
            [("noun", "name"), r"^τᾰ̀ ", "", ""],
            [("noun", "name"), r"(?m)^tằ ", "", ""],
            [("noun", "name"), r"^τοῖσῐ / τοῖσῐν ", "", ""],
            # XXX THIS IS BAD, IF POSSIBLE FIX, ISSUE #1313
            [("noun", "name"), r"(?m)^toîsĭ\(n\) ", "", ""],
            [("noun", "name"), r"(?m)^toîsĭ ", "", ""],
            [("noun", "name"), r"(?m)^toîsĭn ", "", ""],
            # END BAD
            [("noun", "name"), r"^ᾱ̔ ", "", ""],
            [("noun", "name"), r"(?m)^hā ", "", ""],
            [("noun", "name"), r"^ταὶ ", "", ""],
            [("noun", "name"), r"(?m)^taì ", "", ""],
            [("noun", "name"), r"^τᾶς ", "", ""],
            [("noun", "name"), r"(?m)^tâs ", "", ""],
            [("noun", "name"), r"^τᾶν ", "", ""],
            [("noun", "name"), r"(?m)^tân ", "", ""],
            [("noun", "name"), r"^τᾷ ", "", ""],
            [("noun", "name"), r"(?m)^tâi ", "", ""],
            [("noun", "name"), r"^τᾱν ", "", ""],
            [("noun", "name"), r"(?m)^tān ", "", ""],
            [("noun", "name"), r"^τοὶ ", "", ""],
            [("noun", "name"), r"(?m)^toì ", "", ""],
            [("noun", "name"), r"^τῇσῐ ", "", ""],
            [("noun", "name"), r"(?m)^têisĭ ", "", ""],
            [("noun", "name"), r"^τῇσῐν ", "", ""],
            # Romanization of the movable-nu form above; this used to
            # repeat the "têisĭ" pattern and so never matched "têisĭn".
            [("noun", "name"), r"(?m)^têisĭn ", "", ""],
            [("noun", "name"), r"^ὀ ", "", ""],
            [("noun", "name"), r"(?m)^o ", "", ""],
            [("noun", "name"), r"^οἰ ", "", ""],
            [("noun", "name"), r"(?m)^oi ", "", ""],
            [("noun", "name"), r"^τῶ ", "", ""],
            [("noun", "name"), r"(?m)^tô ", "", ""],
            [("noun", "name"), r"^ᾱ ", "", ""],
            [("noun", "name"), r"(?m)^ā ", "", ""],
            [("noun", "name"), r"^αἰ ", "", ""],
            [("noun", "name"), r"(?m)^ai ", "", ""],
            [("noun", "name"), r"^τᾶ ", "", ""],
            [("noun", "name"), r"(?m)^tâ ", "", ""],
            [("noun", "name"), r"^τᾱν ", "", ""],
            [("noun", "name"), r"(?m)^tān ", "", ""],
            [
                ("noun", "name"),
                r"^τοῖ ",
                "",
                "",
            ],  # alternative suggested by user
            [("noun", "name"), r"^τοι ", "", ""],
            [("noun", "name"), r"(?m)^toi ", "", ""],
            [("noun", "name"), r"^τὼς ", "", ""],
            [("noun", "name"), r"(?m)^tṑs ", "", ""],
            [("noun", "name"), r"^τὸς ", "", ""],
            [("noun", "name"), r"(?m)^tòs ", "", ""],
            [("noun", "name"), r"^τὼς ", "", ""],
            [("noun", "name"), r"(?m)^tṑs ", "", ""],
            # [("noun", "name"), r"^", "", ""],
            # [("noun", "name"), r"(?m)^", "", ""],
        ],
    },
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": {"number"},
        "hdr_expand_cont": {"gender", "referent", "misc", "number", "class"},
    },
    "Aragonese": {
        "next": "romance-group",
    },
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                ("possessive", "plural"): ["possessive", "possessed-single"],
            },
        },
    },
    "Aromanian": {
        "next": "romance-group",
    },
    "Aramaic": {
        "next": "semitic-group",
    },
    "Avestan": {
        "next": "Proto-Indo-European",
    },
    "Bavarian": {
        "next": "German",
    },
    "Baiso": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Belarusian": {
        "next": "slavic-group",
    },
    "Bende": {
        "next": "bantu-group",
    },
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {
        "next": "romance-group",
    },
    "Chichewa": {
        "next": "bantu-group",
    },
    "Chimwiini": {
        "next": "bantu-group",
    },
    "Cimbrian": {
        "next": "German",
    },
    "Corsican": {
        "next": "romance-group",
    },
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": {"tense", "mood", "non-finite"},
        "hdr_expand_cont": {"tense", "mood", "voice"},
    },
    "Dalmatian": {
        "next": "romance-group",
    },
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "form_transformations": [
            ["noun", r"^\(as a measure\) ", "", ""],
        ],
    },
    "Eblaite": {
        "next": "semitic-group",
    },
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {
        "next": "romance-group",
    },
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ["verb", r"^\(to\) ", "", ""],
            ["verb", "^to ", "", ""],
            ["verb", r"^I ", "", "first-person singular"],
            ["verb", r"^you ", "", "second-person"],
            ["verb", r"^he ", "", "third-person singular"],
            ["verb", r"^we ", "", "first-person plural"],
            ["verb", r"^they ", "", "third-person"],
            ["verb", r"^it ", "", "third-person singular"],
            ["verb", r"^thou ", "", "second-person singular"],
            ["verb", r"^ye ", "", "second-person plural"],
            ["verb", r" \(thou\)$", "", "second-person singular"],
            ["verb", r" \(ye\)$", "", "second-person plural"],
            ["verb", r"^he/she/it ", "", "third-person singular"],
            ["verb", r"^he/she/it/they ", "", "third-person singular"],
            ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
            ["verb", r"\bthem ", "", "third-person"],
            ["verb", r"\bus ", "", "first-person plural"],
            ["verb", r"\bme ", "", "first-person singular"],
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": {"non-finite"},
        "hdr_expand_cont": {"voice"},
    },
    "Faroese": {
        "ignore_top_left_text_cell": True,
    },
    "Fijian": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Finnish": {
        "hdr_expand_first": set(),
    },
    "French": {
        "next": "romance-group",
    },
    "Friulian": {
        "next": "romance-group",
    },
    "Galician": {
        "next": "romance-group",
    },
    "German": {
        "next": "german-group",
        "form_transformations": [
            ["verb", "^ich ", "", "first-person singular"],
            ["verb", "^du ", "", "second-person singular"],
            ["verb", "^er ", "", "third-person singular"],
            ["verb", "^wir ", "", "first-person plural"],
            ["verb", "^ihr ", "", "second-person plural"],
            ["verb", "^sie ", "", "third-person plural"],
            [
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ],
            ["verb", r" \(du\)$", "", "second-person singular"],
            ["verb", r" \(ihr\)$", "", "second-person plural"],
            ["adj", "^er ist ", "", "masculine singular"],
            ["adj", "^sie ist ", "", "feminine singular"],
            ["adj", "^es ist ", "", "neuter singular"],
            ["adj", "^sie sind ", "", "plural"],
            ["adj", "^keine ", "keine ", "negative"],
            ["adj", "^keiner ", "keiner ", "negative"],
            ["adj", "^keinen ", "keinen ", "negative"],
        ],
        "conditionally_ignored_cells": {
            "definite": [
                "der",
                "die",
                "das",
                "des",
                "dem",
                "den",
            ],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": [
                "kein",
                "keine",
                "keiner",
                "keinen",
            ],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": {"mood", "non-finite"},
        "hdr_expand_cont": {"tense"},
    },
    "Gothic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": {"mood", "tense", "aspect", "dummy"},
        "hdr_expand_cont": {"tense", "person", "number", "aspect"},
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {
        "next": "austronesian-group",
    },
    "Hebrew": {
        "next": "semitic-group",
    },
    "Hijazi Arabic": {
        "next": "semitic-group",
    },
    "Hopi": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Hungarian": {
        "hdr_expand_first": set(),
        "hdr_expand_cont": set(),
    },
    "Hunsrik": {
        "next": "German",
    },
    "Icelandic": {
        "ignore_top_left_text_cell": True,
    },
    "Ilokano": {
        "next": "austronesian-group",
    },
    "Inari Sami": {
        "next": "samojedic-group",
    },
    "Inuktitut": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": {"mood", "tense"},
        "hdr_expand_cont": {"person", "register", "number", "misc"},
        "form_transformations": [
            ["verb", "^non ", "", "negative"],
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {
        "next": "bantu-group",
    },
    "Kapampangan": {
        "next": "austronesian-group",
    },
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {
        "next": "bantu-group",
    },
    "Ladin": {
        "next": "romance-group",
    },
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {
        "empty_row_resets": True,
    },
    "Ligurian": {
        "next": "romance-group",
    },
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {
        "next": "bantu-group",
    },
    "Lombard": {
        "next": "romance-group",
    },
    "Lower Sorbian": {
        "next": "slavic-group",
    },
    "Luganda": {
        "next": "bantu-group",
    },
    "Lule Sami": {
        "next": "samojedic-group",
    },
    "Luxembourgish": {
        "next": "German",
    },
    "Maltese": {
        "next": "semitic-group",
    },
    "Maore Comorian": {
        "next": "bantu-group",
    },
    "Masaba": {
        "next": "bantu-group",
    },
    "Mirandese": {
        "next": "romance-group",
    },
    "Moroccan Arabic": {
        "next": "semitic-group",
    },
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {
        "next": "bantu-group",
    },
    "Mwani": {
        "next": "bantu-group",
    },
    "Navajo": {
        "numbers": [
            "singular",
            "plural",
            "dual",
            "duoplural",
        ],
    },
    "Neapolitan": {
        "next": "romance-group",
    },
    "Nenets": {
        "next": "uralic-group",
    },
    "Ngazidja Comorian": {
        "next": "bantu-group",
    },
    "Niuean": {
        "next": "austronesian-group",
    },
    "Northern Kurdish": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Northern Ndebele": {
        "next": "bantu-group",
    },
    "Northern Sami": {
        "next": "samojedic-group",
    },
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {
        "next": "bantu-group",
    },
    "Occitan": {
        "next": "romance-group",
    },
    "Old Church Slavonic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Old English": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Norse": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Irish": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Pennsylvania German": {
        "next": "German",
    },
    "Phoenician": {
        "next": "semitic-group",
    },
    "Phuthi": {
        "next": "bantu-group",
    },
    "Pite Sami": {
        "next": "samojedic-group",
    },
    "Polish": {
        "next": "slavic-group",
    },
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Proto-Indo-European": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Proto-Samic": {
        "next": "samojedic-group",
    },
    "Proto-Uralic": {
        "next": "uralic-group",
    },
    "Raga": {
        "numbers": ["singular", "dual", "trial", "plural"],
    },
    "Romagnol": {
        "next": "romance-group",
    },
    "Romanian": {
        "next": "romance-group",
    },
    "Romansch": {
        "next": "romance-group",
    },
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": {"non-finite", "mood", "tense"},
        "hdr_expand_cont": {"tense", "number"},
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {
        "next": "bantu-group",
    },
    "Sanskrit": {
        "next": "Proto-Indo-European",
    },
    "Sardinian": {
        "next": "romance-group",
    },
    "Sassarese": {
        "next": "romance-group",
    },
    "Scottish Gaelic": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {
        "next": "romance-group",
    },
    "Skolt Sami": {
        "next": "samojedic-group",
    },
    "Slovene": {
        "next": "slavic-group",
    },
    "Shona": {
        "next": "bantu-group",
    },
    "Sotho": {
        "next": "bantu-group",
    },
    "South Levantine Arabic": {
        "next": "semitic-group",
    },
    "Southern Ndebele": {
        "next": "bantu-group",
    },
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ["verb", "^no ", "", "negative"],
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {
        "next": "bantu-group",
    },
    "Swedish": {
        "hdr_expand_first": {"referent"},
        "hdr_expand_cont": {"degree", "polarity"},
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {
        "next": "bantu-group",
    },
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {
        "next": "austronesian-group",
    },
    "Tausug": {
        "next": "austronesian-group",
    },
    "Tigre": {
        "next": "semitic-group",
    },
    "Tigrinya": {
        "next": "semitic-group",
    },
    "Tongan": {
        "next": "austronesian-group",
    },
    "Tsonga": {
        "next": "bantu-group",
    },
    "Tswana": {
        "next": "bantu-group",
    },
    "Tumbuka": {
        "next": "bantu-group",
    },
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {
        "next": "semitic-group",
    },
    "Ukrainian": {
        "next": "slavic-group",
    },
    "Upper Sorbian": {
        "next": "slavic-group",
    },
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {
        "next": "romance-group",
    },
    "Warlpiri": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Xhosa": {
        "next": "bantu-group",
    },
    "Zulu": {
        "next": "bantu-group",
    },
    "ǃXóõ": {
        "next": "bantu-group",
    },
}

927 

928 

929# Sanity check lang_specific 

930# def_ls_keys = lang_specific["default"].keys() 

931# for k, v in lang_specific.items(): 

932# if k[0].isupper() and k not in languages_by_name: 

933# raise AssertionError( 

934# "key {!r} in lang_specific is not a valid language" 

935# .format(k)) 

936# assert isinstance(v, dict) 

937# for kk, vv in v.items(): 

938# if kk not in def_ls_keys and kk != "next": 

939# raise AssertionError("{} key {!r} not in default entry" 

940# .format(k, kk)) 

941# if kk in ("hdr_expand_first", "hdr_expand_cont"): 

942# if not isinstance(vv, set): 

943# raise AssertionError("{} key {!r} must be set" 

944# .format(lang, kk)) 

945# for t in vv: 

946# if t not in tag_categories: 

947# raise AssertionError("{} key {!r} invalid tag category {}" 

948# .format(k, kk, t)) 

949# elif kk in ("genders", "numbers", "persons", "strengths", "voices"): 

950# if not vv: 

951# continue 

952# if not isinstance(vv, (list, tuple, set)): 

953# raise AssertionError("{} key {!r} must be list/tuple/set" 

954# .format(k, kk)) 

955# for t in vv: 

956# if t not in valid_tags: 

957# raise AssertionError("{} key {!r} invalid tag {!r}" 

958# .format(k, kk, t)) 

959# elif kk == "lang_tag_mappings" and vv is not None: 

960# for pos, transf in vv.items(): 

961# assert pos in PARTS_OF_SPEECH 

962# assert isinstance(transf, dict) 

963# for pre, post in transf.items(): 

964# assert isinstance(pre, tuple) 

965# assert all(t in valid_tags for t in pre) 

966# assert isinstance(post, list) 

967# assert all(t in valid_tags for t in post) 

968# elif kk == "next": 

969# if vv not in lang_specific: 

970# raise AssertionError("{} key {!r} value {!r} is not defined" 

971# .format(k, kk, vv)) 

972 

973 

def get_lang_conf(lang, field):
    """Return ``field`` from the language-specific configuration.

    The search starts at the ``lang_specific`` entry for ``lang`` and
    follows each entry's ``"next"`` link until an entry that defines
    ``field`` is found.  A language with no entry, or an entry with no
    ``"next"`` link, continues the search in the ``"default"`` entry.

    Raises ``RuntimeError`` if ``"default"`` is reached without finding
    ``field``, if ``lang_specific`` has no ``"default"`` entry, or if the
    ``"next"`` links form a cycle.
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    # Entries already visited; a repeat means the "next" links loop back
    # on themselves and the walk would otherwise never terminate.
    seen = set()
    while True:
        if lang in seen:
            raise RuntimeError(
                "Cyclic 'next' chain in lang_specific at {!r}".format(lang)
            )
        seen.add(lang)
        lconfigs = lang_specific.get(lang)
        if lconfigs is None:
            if lang == "default":
                raise RuntimeError("lang_specific has no 'default' entry")
            # Unknown language or group name: use the defaults.
            lang = "default"
            continue
        if field in lconfigs:
            return lconfigs[field]
        if lang == "default":
            # "default" is the end of every chain, so the field is unknown.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        lang = lconfigs.get("next", "default")

989 

990 

def lang_specific_tags(lang, pos, form):
    """Derive tags from a word form using the language's rewrite rules.

    The ``"form_transformations"`` rules for ``lang`` are tried in order.
    Each rule is ``(part(s) of speech, regex, replacement, tags)``, e.g. a
    German verb rule maps a leading ``"^ich "`` to ``""`` and yields
    ``first-person singular``.  Only the first rule whose part of speech
    includes ``pos`` and whose regex matches ``form`` is applied.

    Returns ``(form, tags)``: the form with the matched span replaced and
    the rule's tags as a list, or the unchanged form and ``[]`` when no
    rule applies.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    transformations = get_lang_conf(lang, "form_transformations")
    for rule_pos, regex, replacement, tag_string in transformations:
        # A rule names either one part of speech or a tuple of them;
        # treat both uniformly as a tuple.
        applies_to = rule_pos if isinstance(rule_pos, tuple) else (rule_pos,)
        assert all(p in PARTS_OF_SPEECH for p in applies_to)
        if pos not in applies_to:
            continue
        hit = re.search(regex, form)
        if hit is None:
            continue
        start, end = hit.span()
        rewritten = form[:start] + replacement + form[end:]
        extracted = tag_string.split()
        assert all(t in valid_tags for t in extracted)
        return rewritten, extracted
    return form, []