Coverage for src / wiktextract / extractor / en / lang_specific_configs.py: 82%

41 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-30 08:47 +0000

1# Language-specific configuration for various aspects of inflection table 

2# parsing. 

3 

4import re 

5from typing import Optional, TypedDict, Union 

6 

7from ...tags import valid_tags 

8from .parts_of_speech import PARTS_OF_SPEECH 

9 

10LangConfDict = TypedDict( 

11 "LangConfDict", 

12 { 

13 "next": str, 

14 "hdr_expand_first": set[str], 

15 "hdr_expand_cont": set[str], 

16 "animate_inanimate_remove": bool, 

17 "both_active_passive_remove": bool, 

18 "both_strong_weak_remove": bool, 

19 "definitenesses": list[str], 

20 "empty_row_resets": bool, 

21 "form_transformations": list[ 

22 # POS, pattern, replacement, tags 

23 tuple[str, str, str, str] 

24 ], # tag extraction, lang_specific_tags() 

25 "genders": Optional[list[str]], 

26 "imperative_no_tense": bool, 

27 "masc_only_animate": bool, # Slavic special 

28 "numbers": list[str], 

29 "persons": list[str], 

30 "pl_virile_nonvirile": bool, 

31 "reuse_cellspan": str, # stop/skip/reuse 

32 "skip_mood_mood": bool, 

33 "skip_tense_tense": bool, 

34 "stop_non_finite_non_finite": bool, 

35 "stop_non_finite_voice": bool, 

36 "stop_non_finite_tense": bool, 

37 "strengths": list[str], 

38 "virile_nonvirile_remove": bool, 

39 "voices": list[str], 

40 "special_phrase_splits": dict[ 

41 str, list[Union[list[str], str]] 

42 ], # value: (split phrase, tags) 

43 "form_replacements": dict[ 

44 str, Union[str, list[str]] 

45 ], # value: [replacement, tags] 

46 # Greek-style bracket semantics 

47 "parentheses_for_informal": bool, 

48 "square_brackets_for_rare": bool, 

49 "curly_brackets_for_archaic": bool, 

50 # Armenian; migrated old data here 

51 "lang_tag_mappings": Optional[ 

52 dict[str, dict[tuple[str, ...], list[str]]] 

53 ], 

54 # Spanish has a lot of "vos" and "tú" in its tables that look like 

55 # references, and they give their form certain tags. 

56 # Dict of references ("vos") that point to tag strings "first-person 

57 # singular" that *extend* tags. 

58 "special_references": Optional[dict[str, str]], 

59 # Some languages like Icelandic and Faroese have text cells in the 

60 # upper left that we'd like to ignore. 

61 "ignore_top_left_text_cell": bool, 

62 # Minor regex replacements for cleanup in parse_simple_table() 

63 "minor_text_cleanups": Optional[ 

64 dict[str, str] 

65 ], # dict of {regex: substitution} 

66 "articles_in_separate_columns": bool, 

67 # Cells to ignore in this language, unless the cell has the key 

68 # as a tag. 

69 "conditionally_ignored_cells": dict[str, list[str]], 

70 # dictionary, with the key being a tuple of POS strings so that 

71 # nouns can have different remove patterns from verbs, etc. 

72 "remove_text_patterns": dict[ 

73 tuple[str, ...], tuple[str | re.Pattern, ...] 

74 ] 

75 | None, 

76 }, 

77 total=False, 

78) 

79 

lang_specific: dict[str, LangConfDict] = {
    # Baseline value for every field.  get_lang_conf() falls back to this
    # entry when a language and its "next" chain leave a field unset.
    "default": {
        "hdr_expand_first": {
            "number",
            "mood",
            "referent",
            "aspect",
            "tense",
            "voice",
            "non-finite",
            "case",
            "possession",
        },
        "hdr_expand_cont": {
            "person",
            "gender",
            "number",
            "degree",
            "polarity",
            "voice",
            "misc",
        },
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        # tag extraction, lang_specific_tags()
        "form_transformations": [],
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.  This maps
        # such a reference ("vos") to a tag string ("first-person
        # singular") that *extends* the tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table();
        # dict of {regex: substitution}.
        "minor_text_cleanups": None,
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
        "remove_text_patterns": None,
    },
    # ---- Shared group entries, reached through "next" links ----
    "austronesian-group": {"numbers": ["singular", "dual", "plural"]},
    "bantu-group": {"genders": None},
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {"next": "uralic-group"},
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {"numbers": ["singular", "dual", "plural"]},
    # Languages closely related to or an offshoot of German
    "german-group": {
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    # Germanic languages as a whole
    "germanic-group": {"next": "indo-european-group"},
    # ---- Individual languages ----
    "Akkadian": {"next": "semitic-group"},
    "Alemannic German": {"next": "German"},
    "Amharic": {"next": "semitic-group"},
    "Ancient Greek": {
        "next": "Proto-Indo-European",  # Has dual
        "remove_text_patterns": {
            ("noun", "name"): (
                # Used to remove the gendered article alternatives at the
                # start of table entries like ἰχθυοκένταυρος / Ancient Greek
                # (romanized forms).
                re.compile(
                    r"(?m)^(ā |ai |hā |hai |hē |ho |ho / hē |ho, hē |hoi |"
                    r"hoi / hai |hoi, hai |o |oi |tằ |tâ |taì |tâi |"
                    r"taîs |tân |tān |tān |tâs |tā̀s |têi |tēî |têisĭ |"
                    r"têisĭ |tḕn |tês |tò |tô |tṑ |tṑ |toi |toì |tôi |"
                    r"toîn |toîs |toîsĭ |toîsĭ\(n\) |toîsĭn |toîs / taîs |"
                    r"toîs, taîs |tôi, têi |tōî / tēî |tòn |tôn |"
                    r"tòn / tḕn |tòn, tḕn |tòs |tṑs |tṑs |toû |toùs |"
                    r"toùs / tā̀s |toùs, tā̀s |toû / tês |toû, tês )"
                ),
                # Main Greek-script pattern
                re.compile(
                    r"^(ᾱ |ᾱ̔ |αἰ |αἱ |ἡ |ὀ |ὁ |ὁ / ἡ |ὁ, ἡ |οἰ |οἱ |οἱ / αἱ |"
                    r"οἱ, αἱ |τᾰ̀ |τᾶ |τᾷ |ταὶ |ταῖς |τᾶν |τᾱν |τᾱν |τᾶς |τᾱ̀ς |"
                    r"τῇ |τὴν |τῆς |τῇσῐ |τῇσῐν |τὸ |τοι |τοὶ |τοῖ |τοῖν |"
                    r"τοῖς |"
                    r"τοῖσῐ / τοῖσῐν |τοῖς / ταῖς |τοῖς, ταῖς |τὸν |τὸν / τὴν |"
                    r"τὸν, τὴν |τὸς |τοῦ |τοὺς |τοὺς / τᾱ̀ς |τοὺς, τᾱ̀ς |"
                    r"τοῦ / τῆς |τοῦ, τῆς |τὼ |τῶ |τῷ |τῶν |τὼς |τὼς |"
                    r"τῷ / τῇ |τῷ, τῇ |τὼ )"
                ),
            ),
        },
    },
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": {"number"},
        "hdr_expand_cont": {"gender", "referent", "misc", "number", "class"},
    },
    "Aragonese": {"next": "romance-group"},
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                # NOTE(review): the plural key maps to the same tags as the
                # singular key -- confirm this is intended.
                ("possessive", "singular"): ["possessive", "possessed-single"],
                ("possessive", "plural"): ["possessive", "possessed-single"],
            },
        },
    },
    "Aromanian": {"next": "romance-group"},
    "Aramaic": {"next": "semitic-group"},
    "Avestan": {"next": "Proto-Indo-European"},
    "Bavarian": {"next": "German"},
    "Baiso": {"numbers": ["singular", "paucal", "plural"]},
    "Belarusian": {"next": "slavic-group"},
    "Bende": {"next": "bantu-group"},
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {"next": "romance-group"},
    "Chichewa": {"next": "bantu-group"},
    "Chimwiini": {"next": "bantu-group"},
    "Cimbrian": {"next": "German"},
    "Corsican": {"next": "romance-group"},
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": {"tense", "mood", "non-finite"},
        "hdr_expand_cont": {"tense", "mood", "voice"},
    },
    "Dalmatian": {"next": "romance-group"},
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "remove_text_patterns": {
            # One-element tuples need the trailing comma.
            ("noun",): (re.compile(r"^\(as a measure\) "),),
        },
    },
    "Eblaite": {"next": "semitic-group"},
    "Egyptian": {"definitenesses": ["indefinite", "definite", "construct"]},
    "Emilian": {"next": "romance-group"},
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ("verb", r"^\(to\) ", "", ""),
            ("verb", "^to ", "", ""),
            ("verb", r"^I ", "", "first-person singular"),
            ("verb", r"^you ", "", "second-person"),
            ("verb", r"^he ", "", "third-person singular"),
            ("verb", r"^we ", "", "first-person plural"),
            ("verb", r"^they ", "", "third-person"),
            ("verb", r"^it ", "", "third-person singular"),
            ("verb", r"^thou ", "", "second-person singular"),
            ("verb", r"^ye ", "", "second-person plural"),
            ("verb", r" \(thou\)$", "", "second-person singular"),
            ("verb", r" \(ye\)$", "", "second-person plural"),
            ("verb", r"^he/she/it ", "", "third-person singular"),
            ("verb", r"^he/she/it/they ", "", "third-person singular"),
            ("verb", r"\bhim/her/it/them ", "", "third-person singular"),
            ("verb", r"\bthem ", "", "third-person"),
            ("verb", r"\bus ", "", "first-person plural"),
            ("verb", r"\bme ", "", "first-person singular"),
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": {"non-finite"},
        "hdr_expand_cont": {"voice"},
    },
    "Faroese": {"ignore_top_left_text_cell": True},
    "Fijian": {"numbers": ["singular", "paucal", "plural"]},
    "Finnish": {"hdr_expand_first": set()},
    "French": {"next": "romance-group"},
    "Friulian": {"next": "romance-group"},
    "Galician": {"next": "romance-group"},
    "German": {
        "next": "german-group",
        "form_transformations": [
            ("verb", "^ich ", "", "first-person singular"),
            ("verb", "^du ", "", "second-person singular"),
            ("verb", "^er ", "", "third-person singular"),
            ("verb", "^wir ", "", "first-person plural"),
            ("verb", "^ihr ", "", "second-person plural"),
            ("verb", "^sie ", "", "third-person plural"),
            ("verb", "^dass ich ", "", "first-person singular subordinate-clause"),
            ("verb", "^dass du ", "", "second-person singular subordinate-clause"),
            ("verb", "^dass er ", "", "third-person singular subordinate-clause"),
            ("verb", "^dass wir ", "", "first-person plural subordinate-clause"),
            ("verb", "^dass ihr ", "", "second-person plural subordinate-clause"),
            ("verb", "^dass sie ", "", "third-person plural subordinate-clause"),
            ("verb", r" \(du\)$", "", "second-person singular"),
            ("verb", r" \(ihr\)$", "", "second-person plural"),
            ("adj", "^er ist ", "", "masculine singular"),
            ("adj", "^sie ist ", "", "feminine singular"),
            ("adj", "^es ist ", "", "neuter singular"),
            ("adj", "^sie sind ", "", "plural"),
            ("adj", "^keine ", "keine ", "negative"),
            ("adj", "^keiner ", "keiner ", "negative"),
            ("adj", "^keinen ", "keinen ", "negative"),
        ],
        "conditionally_ignored_cells": {
            "definite": ["der", "die", "das", "des", "dem", "den"],
            "indefinite": ["ein", "eine", "eines", "einer", "einem", "einen"],
            "negative": ["kein", "keine", "keiner", "keinen"],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": {"mood", "non-finite"},
        "hdr_expand_cont": {"tense"},
    },
    "Gothic": {"next": "Proto-Indo-European"},  # Has dual
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": {"mood", "tense", "aspect", "dummy"},
        "hdr_expand_cont": {"tense", "person", "number", "aspect"},
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For Greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {"next": "austronesian-group"},
    "Hebrew": {"next": "semitic-group"},
    "Hijazi Arabic": {"next": "semitic-group"},
    "Hopi": {"numbers": ["singular", "paucal", "plural"]},
    "Hungarian": {
        "hdr_expand_first": set(),
        "hdr_expand_cont": set(),
    },
    "Hunsrik": {"next": "German"},
    "Icelandic": {"ignore_top_left_text_cell": True},
    "Ilokano": {"next": "austronesian-group"},
    "Inari Sami": {"next": "samojedic-group"},
    "Inuktitut": {"numbers": ["singular", "dual", "plural"]},
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": {"mood", "tense"},
        "hdr_expand_cont": {"person", "register", "number", "misc"},
        "form_transformations": [
            ("verb", "^non ", "", "negative"),
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {"next": "bantu-group"},
    "Kapampangan": {"next": "austronesian-group"},
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {"next": "bantu-group"},
    "Ladin": {"next": "romance-group"},
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {"empty_row_resets": True},
    "Ligurian": {"next": "romance-group"},
    "Lihir": {"numbers": ["singular", "dual", "trial", "paucal", "plural"]},
    "Lingala": {"next": "bantu-group"},
    "Lombard": {"next": "romance-group"},
    "Lower Sorbian": {"next": "slavic-group"},
    "Luganda": {"next": "bantu-group"},
    "Lule Sami": {"next": "samojedic-group"},
    "Luxembourgish": {"next": "German"},
    "Maltese": {"next": "semitic-group"},
    "Maore Comorian": {"next": "bantu-group"},
    "Masaba": {"next": "bantu-group"},
    "Mirandese": {"next": "romance-group"},
    "Moroccan Arabic": {"next": "semitic-group"},
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {"next": "bantu-group"},
    "Mwani": {"next": "bantu-group"},
    "Navajo": {"numbers": ["singular", "plural", "dual", "duoplural"]},
    "Neapolitan": {"next": "romance-group"},
    "Nenets": {"next": "uralic-group"},
    "Ngazidja Comorian": {"next": "bantu-group"},
    "Niuean": {"next": "austronesian-group"},
    "Northern Kurdish": {"numbers": ["singular", "paucal", "plural"]},
    "Northern Ndebele": {"next": "bantu-group"},
    "Northern Sami": {"next": "samojedic-group"},
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {"next": "bantu-group"},
    "Occitan": {"next": "romance-group"},
    "Old Church Slavonic": {"next": "Proto-Indo-European"},  # Has dual
    "Old English": {"next": "Proto-Indo-European"},  # Had dual in pronouns
    "Old Norse": {"next": "Proto-Indo-European"},  # Had dual in pronouns
    "Old Irish": {"next": "Proto-Indo-European"},  # Has dual
    "Pennsylvania German": {"next": "German"},
    "Phoenician": {"next": "semitic-group"},
    "Phuthi": {"next": "bantu-group"},
    "Pite Sami": {"next": "samojedic-group"},
    "Polish": {"next": "slavic-group"},
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {"next": "Proto-Indo-European"},  # Has dual
    "Proto-Indo-European": {"numbers": ["singular", "dual", "plural"]},
    "Proto-Samic": {"next": "samojedic-group"},
    "Proto-Uralic": {"next": "uralic-group"},
    "Raga": {"numbers": ["singular", "dual", "trial", "plural"]},
    "Romagnol": {"next": "romance-group"},
    "Romanian": {"next": "romance-group"},
    "Romansch": {"next": "romance-group"},
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": {"non-finite", "mood", "tense"},
        "hdr_expand_cont": {"tense", "number"},
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {"next": "bantu-group"},
    "Sanskrit": {"next": "Proto-Indo-European"},
    "Sardinian": {"next": "romance-group"},
    "Sassarese": {"next": "romance-group"},
    "Scottish Gaelic": {"numbers": ["singular", "dual", "plural"]},
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {"next": "romance-group"},
    "Skolt Sami": {"next": "samojedic-group"},
    "Slovene": {"next": "slavic-group"},
    "Shona": {"next": "bantu-group"},
    "Sotho": {"next": "bantu-group"},
    "South Levantine Arabic": {"next": "semitic-group"},
    "Southern Ndebele": {"next": "bantu-group"},
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ("verb", "^no ", "", "negative"),
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {"next": "bantu-group"},
    "Swedish": {
        "hdr_expand_first": {"referent"},
        "hdr_expand_cont": {"degree", "polarity"},
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {"next": "bantu-group"},
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {"next": "austronesian-group"},
    "Tausug": {"next": "austronesian-group"},
    "Tigre": {"next": "semitic-group"},
    "Tigrinya": {"next": "semitic-group"},
    "Tongan": {"next": "austronesian-group"},
    "Tsonga": {"next": "bantu-group"},
    "Tswana": {"next": "bantu-group"},
    "Tumbuka": {"next": "bantu-group"},
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {"next": "semitic-group"},
    "Ukrainian": {"next": "slavic-group"},
    "Upper Sorbian": {"next": "slavic-group"},
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {"next": "romance-group"},
    "Warlpiri": {"numbers": ["singular", "paucal", "plural"]},
    "Xhosa": {"next": "bantu-group"},
    "Zulu": {"next": "bantu-group"},
    "ǃXóõ": {"next": "bantu-group"},
}

835 

836 

837# Sanity check lang_specific 

838# def_ls_keys = lang_specific["default"].keys() 

839# for k, v in lang_specific.items(): 

840# if k[0].isupper() and k not in languages_by_name: 

841# raise AssertionError( 

842# "key {!r} in lang_specific is not a valid language" 

843# .format(k)) 

844# assert isinstance(v, dict) 

845# for kk, vv in v.items(): 

846# if kk not in def_ls_keys and kk != "next": 

847# raise AssertionError("{} key {!r} not in default entry" 

848# .format(k, kk)) 

849# if kk in ("hdr_expand_first", "hdr_expand_cont"): 

850# if not isinstance(vv, set): 

851# raise AssertionError("{} key {!r} must be set" 

852# .format(k, kk)) 

853# for t in vv: 

854# if t not in tag_categories: 

855# raise AssertionError("{} key {!r} invalid tag category {}" 

856# .format(k, kk, t)) 

857# elif kk in ("genders", "numbers", "persons", "strengths", "voices"): 

858# if not vv: 

859# continue 

860# if not isinstance(vv, (list, tuple, set)): 

861# raise AssertionError("{} key {!r} must be list/tuple/set" 

862# .format(k, kk)) 

863# for t in vv: 

864# if t not in valid_tags: 

865# raise AssertionError("{} key {!r} invalid tag {!r}" 

866# .format(k, kk, t)) 

867# elif kk == "lang_tag_mappings" and vv is not None: 

868# for pos, transf in vv.items(): 

869# assert pos in PARTS_OF_SPEECH 

870# assert isinstance(transf, dict) 

871# for pre, post in transf.items(): 

872# assert isinstance(pre, tuple) 

873# assert all(t in valid_tags for t in pre) 

874# assert isinstance(post, list) 

875# assert all(t in valid_tags for t in post) 

876# elif kk == "next": 

877# if vv not in lang_specific: 

878# raise AssertionError("{} key {!r} value {!r} is not defined" 

879# .format(k, kk, vv)) 

880 

881 

def get_lang_conf(lang, field):
    """Return the value of ``field`` that applies to ``lang``.

    The lookup starts at the ``lang_specific`` entry named ``lang`` and
    follows its ``"next"`` links until some entry defines ``field``.  A name
    with no entry, or an entry with no ``"next"`` link, continues the search
    at ``"default"``.

    Raises ``RuntimeError`` if the search reaches ``"default"`` and that
    entry does not define ``field`` either.
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    current = lang
    while True:
        conf = lang_specific.get(current)
        if conf is None:
            # No entry under this name: resume the search at the defaults.
            current = "default"
            continue
        if field in conf:
            return conf[field]
        if current == "default":
            # Nothing left to fall back to, so the field name is bogus.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        current = conf.get("next", "default")

897 

898 

def lang_specific_tags(lang, pos, form):
    """Derive tags from the word form itself, using per-language rules.

    Some tables encode person/number in the cell text and not in the
    headers; for example German verb forms carry a leading pronoun.  The
    ``"form_transformations"`` rules of ``lang`` are tried in order.  The
    first rule whose part of speech matches ``pos`` and whose regex is found
    in ``form`` is applied: the matched text is replaced by the rule's
    replacement and the rule's tag string is split on whitespace.

    Returns a ``(form, tags)`` pair: the rewritten form with the rule's tags,
    or the unchanged ``form`` with an empty list when no rule applies.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    for rule_pos, regex, replacement, tag_str in get_lang_conf(
        lang, "form_transformations"
    ):
        # rule: PoS, regex, replacement, tags; e.g. "^ich " -> ""
        if isinstance(rule_pos, tuple):
            # A rule may name several parts of speech at once.
            for candidate in rule_pos:
                assert candidate in PARTS_OF_SPEECH
            applies = pos in rule_pos
        else:
            assert rule_pos in PARTS_OF_SPEECH
            applies = pos == rule_pos
        if not applies:
            continue
        found = re.search(regex, form)
        if found is None:
            continue
        rewritten = form[: found.start()] + replacement + form[found.end() :]
        extracted = tag_str.split()
        for tag in extracted:
            assert tag in valid_tags
        # Only the first matching rule is applied.
        return rewritten, extracted
    return form, []