Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 96%

36 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

# Language-specific configuration for various aspects of inflection table
# parsing.

3 

4import re 

5from typing import Optional, TypedDict, Union 

6 

7from ...tags import valid_tags 

8from .parts_of_speech import PARTS_OF_SPEECH 

9 

class LangConfDict(TypedDict, total=False):
    """Shape of one entry in ``lang_specific``.

    Every key is optional (``total=False``): an entry only lists the
    settings it overrides, and ``get_lang_conf()`` follows ``next`` to
    find the rest.
    """

    # Name of the lang_specific entry to consult when a field is missing.
    next: str
    hdr_expand_first: set[str]
    hdr_expand_cont: set[str]
    animate_inanimate_remove: bool
    both_active_passive_remove: bool
    both_strong_weak_remove: bool
    definitenesses: list[str]
    empty_row_resets: bool
    # tag extraction, lang_specific_tags()
    form_transformations: list[list[str]]
    genders: Optional[list[str]]
    imperative_no_tense: bool
    masc_only_animate: bool  # Slavic special
    numbers: list[str]
    persons: list[str]
    pl_virile_nonvirile: bool
    reuse_cellspan: str  # stop/skip/reuse
    skip_mood_mood: bool
    skip_tense_tense: bool
    stop_non_finite_non_finite: bool
    stop_non_finite_voice: bool
    stop_non_finite_tense: bool
    strengths: list[str]
    virile_nonvirile_remove: bool
    voices: list[str]
    # value: (split phrase, tags)
    special_phrase_splits: dict[str, list[Union[list[str], str]]]
    # value: [replacement, tags]
    form_replacements: dict[str, Union[str, list[str]]]
    # Greek-style bracket semantics
    parentheses_for_informal: bool
    square_brackets_for_rare: bool
    curly_brackets_for_archaic: bool
    # Armenian; migrated old data here
    lang_tag_mappings: Optional[dict[str, dict[tuple[str, ...], list[str]]]]
    # Spanish has a lot of "vos" and "tú" in its tables that look like
    # references, and they give their form certain tags.
    # Dict of references ("vos") that point to tag strings "first-person
    # singular" that *extend* tags.
    special_references: Optional[dict[str, str]]
    # Some languages like Icelandic and Faroese have text cells in the
    # upper left that we'd like to ignore.
    ignore_top_left_text_cell: bool
    # Minor regex replacements for cleanup in parse_simple_table();
    # dict of {regex: substitution}
    minor_text_cleanups: Optional[dict[str, str]]
    articles_in_separate_columns: bool
    # Cells to ignore in this language, unless the cell has the key
    # as a tag.
    conditionally_ignored_cells: dict[str, list[str]]

72 

# Per-language (capitalised keys) and per-group ("...-group" keys)
# overrides for inflection-table parsing.  An entry lists only the fields
# it overrides; get_lang_conf() follows "next" from entry to entry and
# ends at "default", which defines every field.
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": {
            "number",
            "mood",
            "referent",
            "aspect",
            "tense",
            "voice",
            "non-finite",
            "case",
            "possession",
        },
        "hdr_expand_cont": {
            "person",
            "gender",
            "number",
            "degree",
            "polarity",
            "voice",
            "misc",
        },
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
    },
    "austronesian-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "bantu-group": {
        "genders": None,
    },
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {
        "next": "uralic-group",
    },
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "german-group": {  # languages closely related to or offshot from German
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    "germanic-group": {  # Germanic languages as a whole
        "next": "indo-european-group",
    },
    "Akkadian": {"next": "semitic-group"},
    "Alemannic German": {"next": "German"},
    "Amharic": {"next": "semitic-group"},
    "Ancient Greek": {"next": "Proto-Indo-European"},  # Has dual
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": {"number"},
        "hdr_expand_cont": {"gender", "referent", "misc", "number", "class"},
    },
    "Aragonese": {"next": "romance-group"},
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                # NOTE(review): the plural key maps to the same
                # "possessed-single" tag as the singular key - confirm this
                # is intended.
                ("possessive", "plural"): ["possessive", "possessed-single"],
            },
        },
    },
    "Aromanian": {"next": "romance-group"},
    "Aramaic": {"next": "semitic-group"},
    "Avestan": {"next": "Proto-Indo-European"},
    "Bavarian": {"next": "German"},
    "Baiso": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Belarusian": {"next": "slavic-group"},
    "Bende": {"next": "bantu-group"},
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {"next": "romance-group"},
    "Chichewa": {"next": "bantu-group"},
    "Chimwiini": {"next": "bantu-group"},
    "Cimbrian": {"next": "German"},
    "Corsican": {"next": "romance-group"},
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": {"tense", "mood", "non-finite"},
        "hdr_expand_cont": {"tense", "mood", "voice"},
    },
    "Dalmatian": {"next": "romance-group"},
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "form_transformations": [
            ["noun", r"^\(as a measure\) ", "", ""],
        ],
    },
    "Eblaite": {"next": "semitic-group"},
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {"next": "romance-group"},
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ["verb", r"^\(to\) ", "", ""],
            ["verb", "^to ", "", ""],
            ["verb", r"^I ", "", "first-person singular"],
            ["verb", r"^you ", "", "second-person"],
            ["verb", r"^he ", "", "third-person singular"],
            ["verb", r"^we ", "", "first-person plural"],
            # NOTE(review): same pattern as the earlier `^you ` rule;
            # lang_specific_tags() returns on the first matching rule, so
            # this one never fires.
            ["verb", r"^you ", "", "second-person plural"],
            ["verb", r"^they ", "", "third-person plural"],
            ["verb", r"^it ", "", "third-person singular"],
            ["verb", r"^thou ", "", "second-person singular"],
            ["verb", r"^ye ", "", "second-person plural"],
            ["verb", r" \(thou\)$", "", "second-person singular"],
            ["verb", r" \(ye\)$", "", "second-person plural"],
            ["verb", r"^he/she/it ", "", "third-person singular"],
            ["verb", r"^he/she/it/they ", "", "third-person singular"],
            ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
            ["verb", r"\bthem ", "", "third-person plural"],
            ["verb", r"\bus ", "", "first-person plural"],
            ["verb", r"\bme ", "", "first-person singular"],
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": {"non-finite"},
        "hdr_expand_cont": {"voice"},
    },
    "Faroese": {
        "ignore_top_left_text_cell": True,
    },
    "Fijian": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Finnish": {
        "hdr_expand_first": set(),
    },
    "French": {"next": "romance-group"},
    "Friulian": {"next": "romance-group"},
    "Galician": {"next": "romance-group"},
    "German": {
        "next": "german-group",
        "form_transformations": [
            ["verb", "^ich ", "", "first-person singular"],
            ["verb", "^du ", "", "second-person singular"],
            ["verb", "^er ", "", "third-person singular"],
            ["verb", "^wir ", "", "first-person plural"],
            ["verb", "^ihr ", "", "second-person plural"],
            ["verb", "^sie ", "", "third-person plural"],
            [
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ],
            ["verb", r" \(du\)$", "", "second-person singular"],
            ["verb", r" \(ihr\)$", "", "second-person plural"],
            ["adj", "^er ist ", "", "masculine singular"],
            ["adj", "^sie ist ", "", "feminine singular"],
            ["adj", "^es ist ", "", "neuter singular"],
            ["adj", "^sie sind ", "", "plural"],
            ["adj", "^keine ", "keine ", "negative"],
            ["adj", "^keiner ", "keiner ", "negative"],
            ["adj", "^keinen ", "keinen ", "negative"],
        ],
        "conditionally_ignored_cells": {
            "definite": [
                "der",
                "die",
                "das",
                "des",
                "dem",
                "den",
            ],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": [
                "kein",
                "keine",
                "keiner",
                "keinen",
            ],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": {"mood", "non-finite"},
        "hdr_expand_cont": {"tense"},
    },
    "Gothic": {"next": "Proto-Indo-European"},  # Has dual
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": {"mood", "tense", "aspect", "dummy"},
        "hdr_expand_cont": {"tense", "person", "number", "aspect"},
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {"next": "austronesian-group"},
    "Hebrew": {"next": "semitic-group"},
    "Hijazi Arabic": {"next": "semitic-group"},
    "Hopi": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Hungarian": {
        "hdr_expand_first": set(),
        "hdr_expand_cont": set(),
    },
    "Hunsrik": {"next": "German"},
    "Icelandic": {
        "ignore_top_left_text_cell": True,
    },
    "Ilokano": {"next": "austronesian-group"},
    "Inari Sami": {"next": "samojedic-group"},
    "Inuktitut": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": {"mood", "tense"},
        "hdr_expand_cont": {"person", "register", "number", "misc"},
        "form_transformations": [
            ["verb", "^non ", "", "negative"],
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {"next": "bantu-group"},
    "Kapampangan": {"next": "austronesian-group"},
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {"next": "bantu-group"},
    "Ladin": {"next": "romance-group"},
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {
        "empty_row_resets": True,
    },
    "Ligurian": {"next": "romance-group"},
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {"next": "bantu-group"},
    "Lombard": {"next": "romance-group"},
    "Lower Sorbian": {"next": "slavic-group"},
    "Luganda": {"next": "bantu-group"},
    "Lule Sami": {"next": "samojedic-group"},
    "Luxembourgish": {"next": "German"},
    "Maltese": {"next": "semitic-group"},
    "Maore Comorian": {"next": "bantu-group"},
    "Masaba": {"next": "bantu-group"},
    "Mirandese": {"next": "romance-group"},
    "Moroccan Arabic": {"next": "semitic-group"},
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {"next": "bantu-group"},
    "Mwani": {"next": "bantu-group"},
    "Navajo": {
        "numbers": [
            "singular",
            "plural",
            "dual",
            "duoplural",
        ],
    },
    "Neapolitan": {"next": "romance-group"},
    "Nenets": {"next": "uralic-group"},
    "Ngazidja Comorian": {"next": "bantu-group"},
    "Niuean": {"next": "austronesian-group"},
    "Northern Kurdish": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Northern Ndebele": {"next": "bantu-group"},
    "Northern Sami": {"next": "samojedic-group"},
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {"next": "bantu-group"},
    "Occitan": {"next": "romance-group"},
    "Old Church Slavonic": {"next": "Proto-Indo-European"},  # Has dual
    "Old English": {"next": "Proto-Indo-European"},  # Had dual in pronouns
    "Old Norse": {"next": "Proto-Indo-European"},  # Had dual in pronouns
    "Old Irish": {"next": "Proto-Indo-European"},  # Has dual
    "Pennsylvania German": {"next": "German"},
    "Phoenician": {"next": "semitic-group"},
    "Phuthi": {"next": "bantu-group"},
    "Pite Sami": {"next": "samojedic-group"},
    "Polish": {"next": "slavic-group"},
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {"next": "Proto-Indo-European"},  # Has dual
    "Proto-Indo-European": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Proto-Samic": {"next": "samojedic-group"},
    "Proto-Uralic": {"next": "uralic-group"},
    "Raga": {
        "numbers": ["singular", "dual", "trial", "plural"],
    },
    "Romagnol": {"next": "romance-group"},
    "Romanian": {"next": "romance-group"},
    "Romansch": {"next": "romance-group"},
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": {"non-finite", "mood", "tense"},
        "hdr_expand_cont": {"tense", "number"},
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {"next": "bantu-group"},
    "Sanskrit": {"next": "Proto-Indo-European"},
    "Sardinian": {"next": "romance-group"},
    "Sassarese": {"next": "romance-group"},
    "Scottish Gaelic": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {"next": "romance-group"},
    "Skolt Sami": {"next": "samojedic-group"},
    "Slovene": {"next": "slavic-group"},
    "Shona": {"next": "bantu-group"},
    "Sotho": {"next": "bantu-group"},
    "South Levantine Arabic": {"next": "semitic-group"},
    "Southern Ndebele": {"next": "bantu-group"},
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ["verb", "^no ", "", "negative"],
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {"next": "bantu-group"},
    "Swedish": {
        "hdr_expand_first": {"referent"},
        "hdr_expand_cont": {"degree", "polarity"},
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {"next": "bantu-group"},
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {"next": "austronesian-group"},
    "Tausug": {"next": "austronesian-group"},
    "Tigre": {"next": "semitic-group"},
    "Tigrinya": {"next": "semitic-group"},
    "Tongan": {"next": "austronesian-group"},
    "Tsonga": {"next": "bantu-group"},
    "Tswana": {"next": "bantu-group"},
    "Tumbuka": {"next": "bantu-group"},
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {"next": "semitic-group"},
    "Ukrainian": {"next": "slavic-group"},
    "Upper Sorbian": {"next": "slavic-group"},
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {"next": "romance-group"},
    "Warlpiri": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Xhosa": {"next": "bantu-group"},
    "Zulu": {"next": "bantu-group"},
    "ǃXóõ": {"next": "bantu-group"},
}

800 

801 

# Sanity check lang_specific
# (Disabled.  `languages_by_name` and `tag_categories` are not among the
# imports visible at the top of this module.)
# def_ls_keys = lang_specific["default"].keys()
# for k, v in lang_specific.items():
#     if k[0].isupper() and k not in languages_by_name:
#         raise AssertionError(
#             "key {!r} in lang_specific is not a valid language"
#             .format(k))
#     assert isinstance(v, dict)
#     for kk, vv in v.items():
#         if kk not in def_ls_keys and kk != "next":
#             raise AssertionError("{} key {!r} not in default entry"
#                                  .format(k, kk))
#         if kk in ("hdr_expand_first", "hdr_expand_cont"):
#             if not isinstance(vv, set):
#                 raise AssertionError("{} key {!r} must be set"
#                                      .format(k, kk))
#             for t in vv:
#                 if t not in tag_categories:
#                     raise AssertionError("{} key {!r} invalid tag category {}"
#                                          .format(k, kk, t))
#         elif kk in ("genders", "numbers", "persons", "strengths", "voices"):
#             if not vv:
#                 continue
#             if not isinstance(vv, (list, tuple, set)):
#                 raise AssertionError("{} key {!r} must be list/tuple/set"
#                                      .format(k, kk))
#             for t in vv:
#                 if t not in valid_tags:
#                     raise AssertionError("{} key {!r} invalid tag {!r}"
#                                          .format(k, kk, t))
#         elif kk == "lang_tag_mappings" and vv is not None:
#             for pos, transf in vv.items():
#                 assert pos in PARTS_OF_SPEECH
#                 assert isinstance(transf, dict)
#                 for pre, post in transf.items():
#                     assert isinstance(pre, tuple)
#                     assert all(t in valid_tags for t in pre)
#                     assert isinstance(post, list)
#                     assert all(t in valid_tags for t in post)
#         elif kk == "next":
#             if vv not in lang_specific:
#                 raise AssertionError("{} key {!r} value {!r} is not defined"
#                                      .format(k, kk, vv))

845 

846 

def get_lang_conf(lang, field):
    """Return ``field`` from the language-specific configuration.

    Lookup starts at ``lang_specific[lang]``.  If that entry does not
    define ``field``, the entry named by its "next" key is tried, and so
    on.  A language that is not listed, or an entry without "next", falls
    through to the "default" entry.

    Raises:
        RuntimeError: if the "default" entry does not define ``field``,
            or if the chain revisits an entry (a cyclic "next" link or a
            missing "default" entry) and therefore could never end.
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    visited = set()
    while True:
        # Without this guard a cyclic "next" chain, or a table with no
        # "default" entry, would spin forever.
        if lang in visited:
            raise RuntimeError(
                "lang_specific lookup for field {!r} does not terminate "
                "(revisited entry {!r})".format(field, lang)
            )
        visited.add(lang)
        lconfigs = lang_specific.get(lang)
        if lconfigs is None:
            # Unlisted language: use the defaults.
            lang = "default"
            continue
        if field in lconfigs:
            return lconfigs[field]
        if lang == "default":
            # "default" is the end of every chain; nothing left to try.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        lang = lconfigs.get("next", "default")

862 

863 

def lang_specific_tags(lang, pos, form):
    """Extract tags from the word form itself in a language-specific way.

    This may also adjust the word form.  For example, German inflected
    verb forms don't have person and number specified in the table, but
    include a pronoun.

    The "form_transformations" rules for ``lang`` are tried in order, and
    only the first rule whose part of speech equals ``pos`` and whose
    regex matches ``form`` is applied.

    Returns:
        A ``(form, tags)`` tuple: the form with the matched span replaced
        by the rule's replacement text, and the rule's tags as a list.
        If no rule applies, the unchanged form and an empty list.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    rules = get_lang_conf(lang, "form_transformations")
    # Each rule is [PoS, regex, replacement, space-separated tags],
    # e.g. ["verb", "^ich ", "", "first-person singular"].
    for rule_pos, pattern, replacement, tag_string in rules:
        assert rule_pos in PARTS_OF_SPEECH
        if rule_pos != pos:
            continue
        m = re.search(pattern, form)
        if m is None:
            continue
        # Replace only the matched span; the rest of the form is kept.
        adjusted = form[: m.start()] + replacement + form[m.end() :]
        rule_tags = tag_string.split()
        for t in rule_tags:
            assert t in valid_tags
        return adjusted, rule_tags
    return form, []