Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 82%

41 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1# Language-specific configuration for various aspects of inflection table 

2# parsing. 

3 

4import re 

5from typing import Optional, TypedDict, Union 

6 

7from ...tags import valid_tags 

8from .parts_of_speech import PARTS_OF_SPEECH 

9 

10LangConfDict = TypedDict( 

11 "LangConfDict", 

12 { 

13 "next": str, 

14 "hdr_expand_first": set[str], 

15 "hdr_expand_cont": set[str], 

16 "animate_inanimate_remove": bool, 

17 "both_active_passive_remove": bool, 

18 "both_strong_weak_remove": bool, 

19 "definitenesses": list[str], 

20 "empty_row_resets": bool, 

21 "form_transformations": list[ 

22 list[tuple[str, ...] | str] 

23 ], # tag extraction, lang_specific_tags() 

24 "genders": Optional[list[str]], 

25 "imperative_no_tense": bool, 

26 "masc_only_animate": bool, # Slavic special 

27 "numbers": list[str], 

28 "persons": list[str], 

29 "pl_virile_nonvirile": bool, 

30 "reuse_cellspan": str, # stop/skip/reuse 

31 "skip_mood_mood": bool, 

32 "skip_tense_tense": bool, 

33 "stop_non_finite_non_finite": bool, 

34 "stop_non_finite_voice": bool, 

35 "stop_non_finite_tense": bool, 

36 "strengths": list[str], 

37 "virile_nonvirile_remove": bool, 

38 "voices": list[str], 

39 "special_phrase_splits": dict[ 

40 str, list[Union[list[str], str]] 

41 ], # value: (split phrase, tags) 

42 "form_replacements": dict[ 

43 str, Union[str, list[str]] 

44 ], # value: [replacement, tags] 

45 # Greek-style bracket semantics 

46 "parentheses_for_informal": bool, 

47 "square_brackets_for_rare": bool, 

48 "curly_brackets_for_archaic": bool, 

49 # Armenian; migrated old data here 

50 "lang_tag_mappings": Optional[ 

51 dict[str, dict[tuple[str, ...], list[str]]] 

52 ], 

53 # Spanish has a lot of "vos" and "tú" in its tables that look like 

54 # references, and they give their form certain tags. 

55 # Dict of references ("vos") that point to tag strings "first-person 

56 # singular" that *extend* tags. 

57 "special_references": Optional[dict[str, str]], 

58 # Some languages like Icelandic and Faroese have text cells in the 

59 # upper left that we'd like to ignore. 

60 "ignore_top_left_text_cell": bool, 

61 # Minor regex replacements for cleanup in parse_simple_table() 

62 "minor_text_cleanups": Optional[ 

63 dict[str, str] 

64 ], # dict of {regex: substitution} 

65 "articles_in_separate_columns": bool, 

66 # Cells to ignore in this language, unless the cell has the key 

67 # as a tag. 

68 "conditionally_ignored_cells": dict[str, list[str]], 

69 "remove_text_patterns": dict[ 

70 tuple[str, ...], tuple[str | re.Pattern, ...] 

71 ] 

72 | None, 

73 }, 

74 total=False, 

75) 

76 

# Per-language and per-language-group parsing configuration.
#
# Keys are either Wiktionary language names ("German") or lowercase group
# names ("romance-group").  An entry lists only the fields it overrides;
# get_lang_conf() resolves a missing field by following the entry's "next"
# key (another key of this dict) and, when there is no "next", by falling
# back to the "default" entry, which defines every field.
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": set(
            [
                "number",
                "mood",
                "referent",
                "aspect",
                "tense",
                "voice",
                "non-finite",
                "case",
                "possession",
            ]
        ),
        "hdr_expand_cont": set(
            [
                "person",
                "gender",
                "number",
                "degree",
                "polarity",
                "voice",
                "misc",
            ]
        ),
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
        "remove_text_patterns": None,
    },
    "austronesian-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "bantu-group": {
        "genders": None,
    },
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {
        "next": "uralic-group",
    },
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "german-group": {  # languages closely related to or offshoot from German
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    "germanic-group": {  # Germanic languages as a whole
        "next": "indo-european-group",
    },
    "Akkadian": {
        "next": "semitic-group",
    },
    "Alemannic German": {
        "next": "German",
    },
    "Amharic": {
        "next": "semitic-group",
    },
    "Ancient Greek": {
        "next": "Proto-Indo-European",  # Has dual
        "remove_text_patterns": {
            ("noun", "name"): (
                # Used to remove the gendered article alternatives at the start
                # of table entries like ἰχθυοκένταυρος / Ancient Greek
                re.compile(
                    r"(?m)^(ā |ai |hā |hai |hē |ho |ho / hē |ho, hē |hoi |"
                    r"hoi / hai |hoi, hai |o |oi |tằ |tâ |taì |tâi |"
                    r"taîs |tân |tān |tān |tâs |tā̀s |têi |tēî |têisĭ |"
                    r"têisĭ |tḕn |tês |tò |tô |tṑ |tṑ |toi |toì |tôi |"
                    r"toîn |toîs |toîsĭ |toîsĭ\(n\) |toîsĭn |toîs / taîs |"
                    r"toîs, taîs |tôi, têi |tōî / tēî |tòn |tôn |"
                    r"tòn / tḕn |tòn, tḕn |tòs |tṑs |tṑs |toû |toùs |"
                    r"toùs / tā̀s |toùs, tā̀s |toû / tês |toû, tês )"
                ),
                # Main greek pattern
                re.compile(
                    r"^(ᾱ |ᾱ̔ |αἰ |αἱ |ἡ |ὀ |ὁ |ὁ / ἡ |ὁ, ἡ |οἰ |οἱ |οἱ / αἱ |"
                    r"οἱ, αἱ |τᾰ̀ |τᾶ |τᾷ |ταὶ |ταῖς |τᾶν |τᾱν |τᾱν |τᾶς |τᾱ̀ς |"
                    r"τῇ |τὴν |τῆς |τῇσῐ |τῇσῐν |τὸ |τοι |τοὶ |τοῖ |τοῖν |"
                    r"τοῖς |"
                    r"τοῖσῐ / τοῖσῐν |τοῖς / ταῖς |τοῖς, ταῖς |τὸν |τὸν / τὴν |"
                    r"τὸν, τὴν |τὸς |τοῦ |τοὺς |τοὺς / τᾱ̀ς |τοὺς, τᾱ̀ς |"
                    r"τοῦ / τῆς |τοῦ, τῆς |τὼ |τῶ |τῷ |τῶν |τὼς |τὼς |"
                    r"τῷ / τῇ |τῷ, τῇ |τὼ )"
                ),
            ),
        },
    },
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": set(["number"]),
        "hdr_expand_cont": set(
            ["gender", "referent", "misc", "number", "class"]
        ),
    },
    "Aragonese": {
        "next": "romance-group",
    },
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                # NOTE(review): plural maps to the same tags as singular
                # ("possessed-single") — confirm this is intended.
                ("possessive", "plural"): ["possessive", "possessed-single"],
            },
        },
    },
    "Aromanian": {
        "next": "romance-group",
    },
    "Aramaic": {
        "next": "semitic-group",
    },
    "Avestan": {
        "next": "Proto-Indo-European",
    },
    "Bavarian": {
        "next": "German",
    },
    "Baiso": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Belarusian": {
        "next": "slavic-group",
    },
    "Bende": {
        "next": "bantu-group",
    },
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {
        "next": "romance-group",
    },
    "Chichewa": {
        "next": "bantu-group",
    },
    "Chimwiini": {
        "next": "bantu-group",
    },
    "Cimbrian": {
        "next": "German",
    },
    "Corsican": {
        "next": "romance-group",
    },
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": set(["tense", "mood", "non-finite"]),
        "hdr_expand_cont": set(["tense", "mood", "voice"]),
    },
    "Dalmatian": {
        "next": "romance-group",
    },
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "remove_text_patterns": {
            # tuples need the comma to be happy
            ("noun",): (re.compile(r"^\(as a measure\) "),),
        },
    },
    "Eblaite": {
        "next": "semitic-group",
    },
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {
        "next": "romance-group",
    },
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        # Each rule: [part of speech, regex, replacement, tag string];
        # applied by lang_specific_tags(), first matching rule wins.
        "form_transformations": [
            ["verb", r"^\(to\) ", "", ""],
            ["verb", "^to ", "", ""],
            ["verb", r"^I ", "", "first-person singular"],
            ["verb", r"^you ", "", "second-person"],
            ["verb", r"^he ", "", "third-person singular"],
            ["verb", r"^we ", "", "first-person plural"],
            ["verb", r"^they ", "", "third-person"],
            ["verb", r"^it ", "", "third-person singular"],
            ["verb", r"^thou ", "", "second-person singular"],
            ["verb", r"^ye ", "", "second-person plural"],
            ["verb", r" \(thou\)$", "", "second-person singular"],
            ["verb", r" \(ye\)$", "", "second-person plural"],
            ["verb", r"^he/she/it ", "", "third-person singular"],
            ["verb", r"^he/she/it/they ", "", "third-person singular"],
            ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
            ["verb", r"\bthem ", "", "third-person"],
            ["verb", r"\bus ", "", "first-person plural"],
            ["verb", r"\bme ", "", "first-person singular"],
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": set(["non-finite"]),
        "hdr_expand_cont": set(["voice"]),
    },
    "Faroese": {
        "ignore_top_left_text_cell": True,
    },
    "Fijian": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Finnish": {
        "hdr_expand_first": set([]),
    },
    "French": {
        "next": "romance-group",
    },
    "Friulian": {
        "next": "romance-group",
    },
    "Galician": {
        "next": "romance-group",
    },
    "German": {
        "next": "german-group",
        # Each rule: [part of speech, regex, replacement, tag string];
        # applied by lang_specific_tags(), first matching rule wins.
        "form_transformations": [
            ["verb", "^ich ", "", "first-person singular"],
            ["verb", "^du ", "", "second-person singular"],
            ["verb", "^er ", "", "third-person singular"],
            ["verb", "^wir ", "", "first-person plural"],
            ["verb", "^ihr ", "", "second-person plural"],
            ["verb", "^sie ", "", "third-person plural"],
            [
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ],
            ["verb", r" \(du\)$", "", "second-person singular"],
            ["verb", r" \(ihr\)$", "", "second-person plural"],
            ["adj", "^er ist ", "", "masculine singular"],
            ["adj", "^sie ist ", "", "feminine singular"],
            ["adj", "^es ist ", "", "neuter singular"],
            ["adj", "^sie sind ", "", "plural"],
            ["adj", "^keine ", "keine ", "negative"],
            ["adj", "^keiner ", "keiner ", "negative"],
            ["adj", "^keinen ", "keinen ", "negative"],
        ],
        "conditionally_ignored_cells": {
            "definite": [
                "der",
                "die",
                "das",
                "des",
                "dem",
                "den",
            ],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": [
                "kein",
                "keine",
                "keiner",
                "keinen",
            ],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": set(["mood", "non-finite"]),
        "hdr_expand_cont": set(["tense"]),
    },
    "Gothic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": set(["mood", "tense", "aspect", "dummy"]),
        "hdr_expand_cont": set(["tense", "person", "number", "aspect"]),
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {
        "next": "austronesian-group",
    },
    "Hebrew": {
        "next": "semitic-group",
    },
    "Hijazi Arabic": {
        "next": "semitic-group",
    },
    "Hopi": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Hungarian": {
        "hdr_expand_first": set([]),
        "hdr_expand_cont": set([]),
    },
    "Hunsrik": {
        "next": "German",
    },
    "Icelandic": {
        "ignore_top_left_text_cell": True,
    },
    "Ilokano": {
        "next": "austronesian-group",
    },
    "Inari Sami": {
        "next": "samojedic-group",
    },
    "Inuktitut": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": set(["mood", "tense"]),
        "hdr_expand_cont": set(["person", "register", "number", "misc"]),
        "form_transformations": [
            ["verb", "^non ", "", "negative"],
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {
        "next": "bantu-group",
    },
    "Kapampangan": {
        "next": "austronesian-group",
    },
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {
        "next": "bantu-group",
    },
    "Ladin": {
        "next": "romance-group",
    },
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {
        "empty_row_resets": True,
    },
    "Ligurian": {
        "next": "romance-group",
    },
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {
        "next": "bantu-group",
    },
    "Lombard": {
        "next": "romance-group",
    },
    "Lower Sorbian": {
        "next": "slavic-group",
    },
    "Luganda": {
        "next": "bantu-group",
    },
    "Lule Sami": {
        "next": "samojedic-group",
    },
    "Luxembourgish": {
        "next": "German",
    },
    "Maltese": {
        "next": "semitic-group",
    },
    "Maore Comorian": {
        "next": "bantu-group",
    },
    "Masaba": {
        "next": "bantu-group",
    },
    "Mirandese": {
        "next": "romance-group",
    },
    "Moroccan Arabic": {
        "next": "semitic-group",
    },
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {
        "next": "bantu-group",
    },
    "Mwani": {
        "next": "bantu-group",
    },
    "Navajo": {
        "numbers": [
            "singular",
            "plural",
            "dual",
            "duoplural",
        ],
    },
    "Neapolitan": {
        "next": "romance-group",
    },
    "Nenets": {
        "next": "uralic-group",
    },
    "Ngazidja Comorian": {
        "next": "bantu-group",
    },
    "Niuean": {
        "next": "austronesian-group",
    },
    "Northern Kurdish": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Northern Ndebele": {
        "next": "bantu-group",
    },
    "Northern Sami": {
        "next": "samojedic-group",
    },
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {
        "next": "bantu-group",
    },
    "Occitan": {
        "next": "romance-group",
    },
    "Old Church Slavonic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Old English": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Norse": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Irish": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Pennsylvania German": {
        "next": "German",
    },
    "Phoenician": {
        "next": "semitic-group",
    },
    "Phuthi": {
        "next": "bantu-group",
    },
    "Pite Sami": {
        "next": "samojedic-group",
    },
    "Polish": {
        "next": "slavic-group",
    },
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Proto-Indo-European": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Proto-Samic": {
        "next": "samojedic-group",
    },
    "Proto-Uralic": {
        "next": "uralic-group",
    },
    "Raga": {
        "numbers": ["singular", "dual", "trial", "plural"],
    },
    "Romagnol": {
        "next": "romance-group",
    },
    "Romanian": {
        "next": "romance-group",
    },
    "Romansch": {
        "next": "romance-group",
    },
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": set(["non-finite", "mood", "tense"]),
        "hdr_expand_cont": set(["tense", "number"]),
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {
        "next": "bantu-group",
    },
    "Sanskrit": {
        "next": "Proto-Indo-European",
    },
    "Sardinian": {
        "next": "romance-group",
    },
    "Sassarese": {
        "next": "romance-group",
    },
    "Scottish Gaelic": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {
        "next": "romance-group",
    },
    "Skolt Sami": {
        "next": "samojedic-group",
    },
    "Slovene": {
        "next": "slavic-group",
    },
    "Shona": {
        "next": "bantu-group",
    },
    "Sotho": {
        "next": "bantu-group",
    },
    "South Levantine Arabic": {
        "next": "semitic-group",
    },
    "Southern Ndebele": {
        "next": "bantu-group",
    },
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ["verb", "^no ", "", "negative"],
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {
        "next": "bantu-group",
    },
    "Swedish": {
        "hdr_expand_first": set(["referent"]),
        "hdr_expand_cont": set(["degree", "polarity"]),
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {
        "next": "bantu-group",
    },
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {
        "next": "austronesian-group",
    },
    "Tausug": {
        "next": "austronesian-group",
    },
    "Tigre": {
        "next": "semitic-group",
    },
    "Tigrinya": {
        "next": "semitic-group",
    },
    "Tongan": {
        "next": "austronesian-group",
    },
    "Tsonga": {
        "next": "bantu-group",
    },
    "Tswana": {
        "next": "bantu-group",
    },
    "Tumbuka": {
        "next": "bantu-group",
    },
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {
        "next": "semitic-group",
    },
    "Ukrainian": {
        "next": "slavic-group",
    },
    "Upper Sorbian": {
        "next": "slavic-group",
    },
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {
        "next": "romance-group",
    },
    "Warlpiri": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Xhosa": {
        "next": "bantu-group",
    },
    "Zulu": {
        "next": "bantu-group",
    },
    "ǃXóõ": {
        "next": "bantu-group",
    },
}

832 

833 

834# Sanity check lang_specific 

835# def_ls_keys = lang_specific["default"].keys() 

836# for k, v in lang_specific.items(): 

837# if k[0].isupper() and k not in languages_by_name: 

838# raise AssertionError( 

839# "key {!r} in lang_specific is not a valid language" 

840# .format(k)) 

841# assert isinstance(v, dict) 

842# for kk, vv in v.items(): 

843# if kk not in def_ls_keys and kk != "next": 

844# raise AssertionError("{} key {!r} not in default entry" 

845# .format(k, kk)) 

846# if kk in ("hdr_expand_first", "hdr_expand_cont"): 

847# if not isinstance(vv, set): 

848# raise AssertionError("{} key {!r} must be set" 

849# .format(lang, kk)) 

850# for t in vv: 

851# if t not in tag_categories: 

852# raise AssertionError("{} key {!r} invalid tag category {}" 

853# .format(k, kk, t)) 

854# elif kk in ("genders", "numbers", "persons", "strengths", "voices"): 

855# if not vv: 

856# continue 

857# if not isinstance(vv, (list, tuple, set)): 

858# raise AssertionError("{} key {!r} must be list/tuple/set" 

859# .format(k, kk)) 

860# for t in vv: 

861# if t not in valid_tags: 

862# raise AssertionError("{} key {!r} invalid tag {!r}" 

863# .format(k, kk, t)) 

864# elif kk == "lang_tag_mappings" and vv is not None: 

865# for pos, transf in vv.items(): 

866# assert pos in PARTS_OF_SPEECH 

867# assert isinstance(transf, dict) 

868# for pre, post in transf.items(): 

869# assert isinstance(pre, tuple) 

870# assert all(t in valid_tags for t in pre) 

871# assert isinstance(post, list) 

872# assert all(t in valid_tags for t in post) 

873# elif kk == "next": 

874# if vv not in lang_specific: 

875# raise AssertionError("{} key {!r} value {!r} is not defined" 

876# .format(k, kk, vv)) 

877 

878 

def get_lang_conf(lang: str, field: str):
    """Return the value of ``field`` for ``lang`` from ``lang_specific``.

    Lookup starts at the entry for ``lang``.  If that entry does not
    define ``field``, the entry named by its "next" key is tried, and so
    on; an entry without "next" (or an unknown language name) falls back
    to the "default" entry.

    Args:
        lang: Language or group name used as a key of ``lang_specific``.
        field: Name of the configuration field to look up.

    Returns:
        The first value found for ``field`` along the fallback chain.

    Raises:
        RuntimeError: If ``field`` is not defined even in "default", if
            ``lang_specific`` has no "default" entry, or if the "next"
            links form a cycle.
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    # Entries already inspected; used to detect a cyclic "next" chain,
    # which would otherwise make this loop spin forever.
    visited: set[str] = set()
    while True:
        lconfigs = lang_specific.get(lang)
        if lconfigs is None:
            if lang == "default":
                # Without this guard a missing "default" entry would loop
                # forever re-assigning lang = "default".
                raise RuntimeError('lang_specific has no "default" entry')
            lang = "default"
            continue
        if field in lconfigs:
            return lconfigs[field]
        if lang == "default":
            # "default" is the end of every chain: nothing left to try.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        if lang in visited:
            raise RuntimeError(
                "Cyclic 'next' chain in lang_specific at {!r}".format(lang)
            )
        visited.add(lang)
        lang = lconfigs.get("next", "default")

894 

895 

def lang_specific_tags(lang, pos, form):
    """Derive tags from a word form using the language's rewrite rules.

    The "form_transformations" rules configured for ``lang`` are tried in
    order.  Each rule is (part(s) of speech, regex, replacement, tag
    string), e.g. German "^ich " -> "" with "first-person singular".  The
    first rule whose part of speech covers ``pos`` and whose regex is
    found in ``form`` is applied: the matched span is replaced and the
    rule's tags are returned.

    Returns a ``(form, tags)`` pair; when no rule applies, ``form`` is
    returned unchanged together with an empty list.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    transformations = get_lang_conf(lang, "form_transformations")
    for rule_pos, regex, replacement, tag_string in transformations:
        # A rule may target a single part of speech or a tuple of them.
        if isinstance(rule_pos, tuple):
            assert all(candidate in PARTS_OF_SPEECH for candidate in rule_pos)
            applies = pos in rule_pos
        else:
            assert rule_pos in PARTS_OF_SPEECH
            applies = pos == rule_pos
        if not applies:
            continue
        found = re.search(regex, form)
        if found is None:
            continue
        # Splice the replacement over the matched span only.
        prefix = form[: found.start()]
        suffix = form[found.end() :]
        extracted = tag_string.split()
        assert all(tag in valid_tags for tag in extracted)
        return prefix + replacement + suffix, extracted
    return form, []