Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 96%

36 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1# Language-specific configuration for various aspects of inflection table 

2# parsing. 

3 

4import re 

5from typing import Optional, TypedDict, Union 

6 

7from ...tags import valid_tags 

8from .parts_of_speech import PARTS_OF_SPEECH 

9 

class LangConfDict(TypedDict, total=False):
    """Schema of one entry in ``lang_specific``.

    Every key is optional (``total=False``): an entry lists only the
    settings it overrides, and ``get_lang_conf()`` resolves the rest
    through the ``next`` chain and finally the ``"default"`` entry.
    """

    # Name of the entry to consult when a field is missing here.
    next: str
    hdr_expand_first: set[str]
    hdr_expand_cont: set[str]
    animate_inanimate_remove: bool
    both_active_passive_remove: bool
    both_strong_weak_remove: bool
    definitenesses: list[str]
    empty_row_resets: bool
    # Tag extraction, see lang_specific_tags(); each rule is
    # [PoS, regex, replacement, tags].
    form_transformations: list[list[str]]
    genders: Optional[list[str]]
    imperative_no_tense: bool
    masc_only_animate: bool  # Slavic special
    numbers: list[str]
    persons: list[str]
    pl_virile_nonvirile: bool
    reuse_cellspan: str  # stop/skip/reuse
    skip_mood_mood: bool
    skip_tense_tense: bool
    stop_non_finite_non_finite: bool
    stop_non_finite_voice: bool
    stop_non_finite_tense: bool
    strengths: list[str]
    virile_nonvirile_remove: bool
    voices: list[str]
    # value: (split phrase, tags)
    special_phrase_splits: dict[str, list[Union[list[str], str]]]
    # value: [replacement, tags]
    form_replacements: dict[str, Union[str, list[str]]]
    # Greek-style bracket semantics
    parentheses_for_informal: bool
    square_brackets_for_rare: bool
    curly_brackets_for_archaic: bool
    # Armenian; migrated old data here
    lang_tag_mappings: Optional[dict[str, dict[tuple[str, ...], list[str]]]]
    # Spanish has a lot of "vos" and "tú" in its tables that look like
    # references, and they give their form certain tags.
    # Dict of references ("vos") that point to tag strings "first-person
    # singular" that *extend* tags.
    special_references: Optional[dict[str, str]]
    # Some languages like Icelandic and Faroese have text cells in the
    # upper left that we'd like to ignore.
    ignore_top_left_text_cell: bool
    # Minor regex replacements for cleanup in parse_simple_table();
    # dict of {regex: substitution}
    minor_text_cleanups: Optional[dict[str, str]]
    articles_in_separate_columns: bool
    # Cells to ignore in this language, unless the cell has the key
    # as a tag.
    conditionally_ignored_cells: dict[str, list[str]]

72 

# Per-language inflection-table parsing settings.
#
# Keys are "default", family/group names ending in "-group", or language
# names. An entry lists only the fields it overrides; get_lang_conf()
# resolves a missing field by following the entry's "next" key (falling
# back to "default" when there is no "next"), so "default" must define
# every field that may be requested.
#
# "form_transformations" rules are [PoS, regex, replacement, tags] and are
# applied by lang_specific_tags(), which stops at the first matching rule.
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": set(
            [
                "number",
                "mood",
                "referent",
                "aspect",
                "tense",
                "voice",
                "non-finite",
                "case",
                "possession",
            ]
        ),
        "hdr_expand_cont": set(
            [
                "person",
                "gender",
                "number",
                "degree",
                "polarity",
                "voice",
                "misc",
            ]
        ),
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
    },
    "austronesian-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "bantu-group": {
        "genders": None,
    },
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {
        "next": "uralic-group",
    },
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "german-group": {  # languages closely related to or offshoots of German
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    "germanic-group": {  # Germanic languages as a whole
        "next": "indo-european-group",
    },
    "Akkadian": {
        "next": "semitic-group",
    },
    "Alemannic German": {
        "next": "German",
    },
    "Amharic": {
        "next": "semitic-group",
    },
    "Ancient Greek": {
        "next": "Proto-Indo-European",  # Has dual
        "form_transformations": [
            # Used to remove the gendered article alternatives at the start
            # of table entries like ἰχθυοκένταυρος / Ancient Greek
            # NOTE(review): the "^τὼ " rule and its romanized counterpart
            # "(?m)^tṑ " each appear twice below; the second copy can never
            # be reached after the first one matches — presumably harmless.
            ["noun", "^ὁ, ἡ ", "", ""],
            ["noun", "^τὼ ", "", ""],
            ["noun", "^οἱ, αἱ ", "", ""],
            ["noun", "^τοῦ, τῆς ", "", ""],
            ["noun", "^τοῖν ", "", ""],
            ["noun", "^τῶν ", "", ""],
            ["noun", "^τῷ, τῇ ", "", ""],
            ["noun", "^τοῖς, ταῖς ", "", ""],
            ["noun", "^τὸν, τὴν ", "", ""],
            ["noun", "^τὼ ", "", ""],
            ["noun", "^τοὺς, τᾱ̀ς ", "", ""],
            ["noun", "(?m)^ho, hē ", "", ""],
            ["noun", "(?m)^tṑ ", "", ""],
            ["noun", "(?m)^hoi, hai ", "", ""],
            # NOTE(review): unlike every neighbouring pattern this one has
            # no trailing space, so a leading space would be left on the
            # form — confirm whether that is intended.
            ["noun", "(?m)^toû, tês", "", ""],
            ["noun", "(?m)^toîn ", "", ""],
            ["noun", "(?m)^tôn ", "", ""],
            ["noun", "(?m)^tôi, têi ", "", ""],
            ["noun", "(?m)^toîs, taîs ", "", ""],
            ["noun", "(?m)^tòn, tḕn ", "", ""],
            ["noun", "(?m)^tṑ ", "", ""],
            ["noun", "(?m)^toùs, tā̀s ", "", ""],
        ],
    },
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": set(["number"]),
        "hdr_expand_cont": set(
            ["gender", "referent", "misc", "number", "class"]
        ),
    },
    "Aragonese": {
        "next": "romance-group",
    },
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                # NOTE(review): the plural key maps to the same
                # "possessed-single" tag as the singular key — confirm this
                # is intended and not a copy-paste slip.
                ("possessive", "plural"): ["possessive", "possessed-single"],
            },
        },
    },
    "Aromanian": {
        "next": "romance-group",
    },
    "Aramaic": {
        "next": "semitic-group",
    },
    "Avestan": {
        "next": "Proto-Indo-European",
    },
    "Bavarian": {
        "next": "German",
    },
    "Baiso": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Belarusian": {
        "next": "slavic-group",
    },
    "Bende": {
        "next": "bantu-group",
    },
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {
        "next": "romance-group",
    },
    "Chichewa": {
        "next": "bantu-group",
    },
    "Chimwiini": {
        "next": "bantu-group",
    },
    "Cimbrian": {
        "next": "German",
    },
    "Corsican": {
        "next": "romance-group",
    },
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": set(["tense", "mood", "non-finite"]),
        "hdr_expand_cont": set(["tense", "mood", "voice"]),
    },
    "Dalmatian": {
        "next": "romance-group",
    },
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "form_transformations": [
            ["noun", r"^\(as a measure\) ", "", ""],
        ],
    },
    "Eblaite": {
        "next": "semitic-group",
    },
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {
        "next": "romance-group",
    },
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ["verb", r"^\(to\) ", "", ""],
            ["verb", "^to ", "", ""],
            ["verb", r"^I ", "", "first-person singular"],
            ["verb", r"^you ", "", "second-person"],
            ["verb", r"^he ", "", "third-person singular"],
            ["verb", r"^we ", "", "first-person plural"],
            ["verb", r"^they ", "", "third-person"],
            ["verb", r"^it ", "", "third-person singular"],
            ["verb", r"^thou ", "", "second-person singular"],
            ["verb", r"^ye ", "", "second-person plural"],
            ["verb", r" \(thou\)$", "", "second-person singular"],
            ["verb", r" \(ye\)$", "", "second-person plural"],
            ["verb", r"^he/she/it ", "", "third-person singular"],
            ["verb", r"^he/she/it/they ", "", "third-person singular"],
            ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
            ["verb", r"\bthem ", "", "third-person"],
            ["verb", r"\bus ", "", "first-person plural"],
            ["verb", r"\bme ", "", "first-person singular"],
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": set(["non-finite"]),
        "hdr_expand_cont": set(["voice"]),
    },
    "Faroese": {
        "ignore_top_left_text_cell": True,
    },
    "Fijian": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Finnish": {
        "hdr_expand_first": set([]),
    },
    "French": {
        "next": "romance-group",
    },
    "Friulian": {
        "next": "romance-group",
    },
    "Galician": {
        "next": "romance-group",
    },
    "German": {
        "next": "german-group",
        "form_transformations": [
            ["verb", "^ich ", "", "first-person singular"],
            ["verb", "^du ", "", "second-person singular"],
            ["verb", "^er ", "", "third-person singular"],
            ["verb", "^wir ", "", "first-person plural"],
            ["verb", "^ihr ", "", "second-person plural"],
            ["verb", "^sie ", "", "third-person plural"],
            [
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ],
            ["verb", r" \(du\)$", "", "second-person singular"],
            ["verb", r" \(ihr\)$", "", "second-person plural"],
            ["adj", "^er ist ", "", "masculine singular"],
            ["adj", "^sie ist ", "", "feminine singular"],
            ["adj", "^es ist ", "", "neuter singular"],
            ["adj", "^sie sind ", "", "plural"],
            # These rules replace the match with itself, i.e. they only
            # add the "negative" tag and leave the form unchanged.
            ["adj", "^keine ", "keine ", "negative"],
            ["adj", "^keiner ", "keiner ", "negative"],
            ["adj", "^keinen ", "keinen ", "negative"],
        ],
        "conditionally_ignored_cells": {
            "definite": [
                "der",
                "die",
                "das",
                "des",
                "dem",
                "den",
            ],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": [
                "kein",
                "keine",
                "keiner",
                "keinen",
            ],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": set(["mood", "non-finite"]),
        "hdr_expand_cont": set(["tense"]),
    },
    "Gothic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": set(["mood", "tense", "aspect", "dummy"]),
        "hdr_expand_cont": set(["tense", "person", "number", "aspect"]),
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For Greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {
        "next": "austronesian-group",
    },
    "Hebrew": {
        "next": "semitic-group",
    },
    "Hijazi Arabic": {
        "next": "semitic-group",
    },
    "Hopi": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Hungarian": {
        "hdr_expand_first": set([]),
        "hdr_expand_cont": set([]),
    },
    "Hunsrik": {
        "next": "German",
    },
    "Icelandic": {
        "ignore_top_left_text_cell": True,
    },
    "Ilokano": {
        "next": "austronesian-group",
    },
    "Inari Sami": {
        "next": "samojedic-group",
    },
    "Inuktitut": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": set(["mood", "tense"]),
        "hdr_expand_cont": set(["person", "register", "number", "misc"]),
        "form_transformations": [
            ["verb", "^non ", "", "negative"],
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {
        "next": "bantu-group",
    },
    "Kapampangan": {
        "next": "austronesian-group",
    },
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {
        "next": "bantu-group",
    },
    "Ladin": {
        "next": "romance-group",
    },
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {
        "empty_row_resets": True,
    },
    "Ligurian": {
        "next": "romance-group",
    },
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {
        "next": "bantu-group",
    },
    "Lombard": {
        "next": "romance-group",
    },
    "Lower Sorbian": {
        "next": "slavic-group",
    },
    "Luganda": {
        "next": "bantu-group",
    },
    "Lule Sami": {
        "next": "samojedic-group",
    },
    "Luxembourgish": {
        "next": "German",
    },
    "Maltese": {
        "next": "semitic-group",
    },
    "Maore Comorian": {
        "next": "bantu-group",
    },
    "Masaba": {
        "next": "bantu-group",
    },
    "Mirandese": {
        "next": "romance-group",
    },
    "Moroccan Arabic": {
        "next": "semitic-group",
    },
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {
        "next": "bantu-group",
    },
    "Mwani": {
        "next": "bantu-group",
    },
    "Navajo": {
        "numbers": [
            "singular",
            "plural",
            "dual",
            "duoplural",
        ],
    },
    "Neapolitan": {
        "next": "romance-group",
    },
    "Nenets": {
        "next": "uralic-group",
    },
    "Ngazidja Comorian": {
        "next": "bantu-group",
    },
    "Niuean": {
        "next": "austronesian-group",
    },
    "Northern Kurdish": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Northern Ndebele": {
        "next": "bantu-group",
    },
    "Northern Sami": {
        "next": "samojedic-group",
    },
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {
        "next": "bantu-group",
    },
    "Occitan": {
        "next": "romance-group",
    },
    "Old Church Slavonic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Old English": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Norse": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Irish": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Pennsylvania German": {
        "next": "German",
    },
    "Phoenician": {
        "next": "semitic-group",
    },
    "Phuthi": {
        "next": "bantu-group",
    },
    "Pite Sami": {
        "next": "samojedic-group",
    },
    "Polish": {
        "next": "slavic-group",
    },
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Proto-Indo-European": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Proto-Samic": {
        "next": "samojedic-group",
    },
    "Proto-Uralic": {
        "next": "uralic-group",
    },
    "Raga": {
        "numbers": ["singular", "dual", "trial", "plural"],
    },
    "Romagnol": {
        "next": "romance-group",
    },
    "Romanian": {
        "next": "romance-group",
    },
    "Romansch": {
        "next": "romance-group",
    },
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": set(["non-finite", "mood", "tense"]),
        "hdr_expand_cont": set(["tense", "number"]),
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {
        "next": "bantu-group",
    },
    "Sanskrit": {
        "next": "Proto-Indo-European",
    },
    "Sardinian": {
        "next": "romance-group",
    },
    "Sassarese": {
        "next": "romance-group",
    },
    "Scottish Gaelic": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {
        "next": "romance-group",
    },
    "Skolt Sami": {
        "next": "samojedic-group",
    },
    "Slovene": {
        "next": "slavic-group",
    },
    "Shona": {
        "next": "bantu-group",
    },
    "Sotho": {
        "next": "bantu-group",
    },
    "South Levantine Arabic": {
        "next": "semitic-group",
    },
    "Southern Ndebele": {
        "next": "bantu-group",
    },
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ["verb", "^no ", "", "negative"],
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {
        "next": "bantu-group",
    },
    "Swedish": {
        "hdr_expand_first": set(["referent"]),
        "hdr_expand_cont": set(["degree", "polarity"]),
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {
        "next": "bantu-group",
    },
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {
        "next": "austronesian-group",
    },
    "Tausug": {
        "next": "austronesian-group",
    },
    "Tigre": {
        "next": "semitic-group",
    },
    "Tigrinya": {
        "next": "semitic-group",
    },
    "Tongan": {
        "next": "austronesian-group",
    },
    "Tsonga": {
        "next": "bantu-group",
    },
    "Tswana": {
        "next": "bantu-group",
    },
    "Tumbuka": {
        "next": "bantu-group",
    },
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {
        "next": "semitic-group",
    },
    "Ukrainian": {
        "next": "slavic-group",
    },
    "Upper Sorbian": {
        "next": "slavic-group",
    },
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {
        "next": "romance-group",
    },
    "Warlpiri": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Xhosa": {
        "next": "bantu-group",
    },
    "Zulu": {
        "next": "bantu-group",
    },
    "ǃXóõ": {
        "next": "bantu-group",
    },
}

825 

826 

827# Sanity check lang_specific 

828# def_ls_keys = lang_specific["default"].keys() 

829# for k, v in lang_specific.items(): 

830# if k[0].isupper() and k not in languages_by_name: 

831# raise AssertionError( 

832# "key {!r} in lang_specific is not a valid language" 

833# .format(k)) 

834# assert isinstance(v, dict) 

835# for kk, vv in v.items(): 

836# if kk not in def_ls_keys and kk != "next": 

837# raise AssertionError("{} key {!r} not in default entry" 

838# .format(k, kk)) 

839# if kk in ("hdr_expand_first", "hdr_expand_cont"): 

840# if not isinstance(vv, set): 

841# raise AssertionError("{} key {!r} must be set" 

842# .format(k, kk)) 

843# for t in vv: 

844# if t not in tag_categories: 

845# raise AssertionError("{} key {!r} invalid tag category {}" 

846# .format(k, kk, t)) 

847# elif kk in ("genders", "numbers", "persons", "strengths", "voices"): 

848# if not vv: 

849# continue 

850# if not isinstance(vv, (list, tuple, set)): 

851# raise AssertionError("{} key {!r} must be list/tuple/set" 

852# .format(k, kk)) 

853# for t in vv: 

854# if t not in valid_tags: 

855# raise AssertionError("{} key {!r} invalid tag {!r}" 

856# .format(k, kk, t)) 

857# elif kk == "lang_tag_mappings" and vv is not None: 

858# for pos, transf in vv.items(): 

859# assert pos in PARTS_OF_SPEECH 

860# assert isinstance(transf, dict) 

861# for pre, post in transf.items(): 

862# assert isinstance(pre, tuple) 

863# assert all(t in valid_tags for t in pre) 

864# assert isinstance(post, list) 

865# assert all(t in valid_tags for t in post) 

866# elif kk == "next": 

867# if vv not in lang_specific: 

868# raise AssertionError("{} key {!r} value {!r} is not defined" 

869# .format(k, kk, vv)) 

870 

871 

def get_lang_conf(lang, field):
    """Look up a configuration field for a language, with fallbacks.

    Starting at the ``lang_specific`` entry named ``lang``, return the
    first value found for ``field`` while walking the entries' ``"next"``
    links. An entry without ``"next"``, or a name that has no entry at
    all, falls through to the ``"default"`` entry.

    Raises ``RuntimeError`` if even ``"default"`` does not define
    ``field``.
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    current = lang
    while True:
        entry = lang_specific.get(current)
        if entry is None:
            # Unlisted language (or dangling "next"): use the defaults.
            current = "default"
            continue
        if field in entry:
            return entry[field]
        if current == "default":
            # End of every chain; nothing left to fall back to.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        current = entry.get("next", "default")

887 

888 

def lang_specific_tags(lang, pos, form):
    """Derive tags from the text of a word form, per language.

    Some tables encode grammatical information in the form itself; for
    example German verb forms carry a pronoun ("ich ...") instead of
    person/number headers. The language's ``"form_transformations"``
    rules are tried in order; the first rule whose part of speech equals
    ``pos`` and whose regex matches ``form`` wins.

    Returns a ``(form, tags)`` pair: the form with the matched text
    replaced by the rule's replacement plus the rule's tags as a list,
    or the unchanged form and an empty list when no rule applies.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    transformations = get_lang_conf(lang, "form_transformations")
    # Each rule: PoS, regex, replacement, tags;  e.g. "^ich " -> ""
    for rule_pos, regex, replacement, tag_str in transformations:
        assert rule_pos in PARTS_OF_SPEECH
        if rule_pos != pos:
            continue
        hit = re.search(regex, form)
        if hit is None:
            continue
        rewritten = form[: hit.start()] + replacement + form[hit.end() :]
        extracted = tag_str.split()
        assert all(tag in valid_tags for tag in extracted)
        return rewritten, extracted
    return form, []