Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 96%

36 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1# Language-specific configuration for various aspects of inflection table 

2# parsing. 

3 

4import re 

5from typing import Optional, TypedDict, Union 

6 

7from ...tags import valid_tags 

8from .parts_of_speech import PARTS_OF_SPEECH 

9 

class LangConfDict(TypedDict, total=False):
    """Shape of one entry in ``lang_specific``.

    All keys are optional (``total=False``): an entry lists only the
    fields it overrides, and lookups for anything else continue through
    the entry named by ``next``.
    """

    # Name of the ``lang_specific`` entry to consult when a field is
    # missing from this one.
    next: str
    hdr_expand_first: set[str]
    hdr_expand_cont: set[str]
    animate_inanimate_remove: bool
    both_active_passive_remove: bool
    both_strong_weak_remove: bool
    definitenesses: list[str]
    empty_row_resets: bool
    # tag extraction, lang_specific_tags()
    form_transformations: list[list[str]]
    genders: Optional[list[str]]
    imperative_no_tense: bool
    masc_only_animate: bool  # Slavic special
    numbers: list[str]
    persons: list[str]
    pl_virile_nonvirile: bool
    reuse_cellspan: str  # stop/skip/reuse
    skip_mood_mood: bool
    skip_tense_tense: bool
    stop_non_finite_non_finite: bool
    stop_non_finite_voice: bool
    stop_non_finite_tense: bool
    strengths: list[str]
    virile_nonvirile_remove: bool
    voices: list[str]
    # value: (split phrase, tags)
    special_phrase_splits: dict[str, list[Union[list[str], str]]]
    # value: [replacement, tags]
    form_replacements: dict[str, Union[str, list[str]]]
    # Greek-style bracket semantics
    parentheses_for_informal: bool
    square_brackets_for_rare: bool
    curly_brackets_for_archaic: bool
    # Armenian; migrated old data here
    lang_tag_mappings: Optional[dict[str, dict[tuple[str, ...], list[str]]]]
    # Spanish has a lot of "vos" and "tú" in its tables that look like
    # references, and they give their form certain tags.
    # Dict of references ("vos") that point to tag strings "first-person
    # singular" that *extend* tags.
    special_references: Optional[dict[str, str]]
    # Some languages like Icelandic and Faroese have text cells in the
    # upper left that we'd like to ignore.
    ignore_top_left_text_cell: bool
    # Minor regex replacements for cleanup in parse_simple_table();
    # dict of {regex: substitution}
    minor_text_cleanups: Optional[dict[str, str]]
    articles_in_separate_columns: bool
    # Cells to ignore in this language, unless the cell has the key
    # as a tag.
    conditionally_ignored_cells: dict[str, list[str]]

72 

# Per-language configuration for inflection table parsing.
#
# Keys are either Wiktionary language names ("German"), lower-case group
# names ("romance-group") or the special "default" entry.  An entry only
# lists the fields it overrides; get_lang_conf() follows the "next" link
# (falling back to "default") for every field that is not present.
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": {
            "number",
            "mood",
            "referent",
            "aspect",
            "tense",
            "voice",
            "non-finite",
            "case",
            "possession",
        },
        "hdr_expand_cont": {
            "person",
            "gender",
            "number",
            "degree",
            "polarity",
            "voice",
            "misc",
        },
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
    },
    "austronesian-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "bantu-group": {
        "genders": None,
    },
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {
        "next": "uralic-group",
    },
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "german-group": {  # languages closely related to or offshoot from German
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    "germanic-group": {  # Germanic languages as a whole
        "next": "indo-european-group",
    },
    "Akkadian": {
        "next": "semitic-group",
    },
    "Alemannic German": {
        "next": "German",
    },
    "Amharic": {
        "next": "semitic-group",
    },
    "Ancient Greek": {
        "next": "Proto-Indo-European",  # Has dual
        "form_transformations": [
            # Used to remove the gendered article alternatives at the start
            # of table entries like ἰχθυοκένταυρος / Ancient Greek
            ["noun", "^ὁ, ἡ ", "", ""],
            ["noun", "^τὼ ", "", ""],
            ["noun", "^οἱ, αἱ ", "", ""],
            ["noun", "^τοῦ, τῆς ", "", ""],
            ["noun", "^τοῖν ", "", ""],
            ["noun", "^τῶν ", "", ""],
            ["noun", "^τῷ, τῇ ", "", ""],
            ["noun", "^τοῖς, ταῖς ", "", ""],
            ["noun", "^τὸν, τὴν ", "", ""],
            ["noun", "^τὼ ", "", ""],
            ["noun", "^τοὺς, τᾱ̀ς ", "", ""],
            ["noun", "(?m)^ho, hē ", "", ""],
            ["noun", "(?m)^tṑ ", "", ""],
            ["noun", "(?m)^hoi, hai ", "", ""],
            ["noun", "(?m)^toû, tês ", "", ""],
            ["noun", "(?m)^toîn ", "", ""],
            ["noun", "(?m)^tôn ", "", ""],
            ["noun", "(?m)^tôi, têi ", "", ""],
            ["noun", "(?m)^toîs, taîs ", "", ""],
            ["noun", "(?m)^tòn, tḕn ", "", ""],
            ["noun", "(?m)^tṑ ", "", ""],
            ["noun", "(?m)^toùs, tā̀s ", "", ""],
            # New added ones, leaving the old ones just in case
            ["noun", r"^ὁ ", "", ""],
            ["noun", r"^ἡ ", "", ""],
            ["noun", r"(?m)^hē ", "", ""],
            ["noun", r"^αἱ ", "", ""],
            ["noun", r"(?m)^hai ", "", ""],
            ["noun", r"^τῆς ", "", ""],
            ["noun", r"(?m)^tês ", "", ""],
            ["noun", r"^τῇ ", "", ""],
            ["noun", r"(?m)^tēî ", "", ""],
            ["noun", r"^ταῖς ", "", ""],
            ["noun", r"(?m)^taîs ", "", ""],
            ["noun", r"^τὴν ", "", ""],
            ["noun", r"(?m)^tḕn ", "", ""],
            ["noun", r"^τᾱ̀ς ", "", ""],
            ["noun", r"(?m)^tā̀s ", "", ""],
            ["noun", r"^ὁ / ἡ ", "", ""],
            ["noun", r"(?m)^ho / hē ", "", ""],
            ["noun", r"^οἱ / αἱ ", "", ""],
            ["noun", r"(?m)^hoi / hai ", "", ""],
            ["noun", r"^τοῦ / τῆς ", "", ""],
            ["noun", r"(?m)^toû / tês ", "", ""],
            ["noun", r"^τῷ / τῇ ", "", ""],
            ["noun", r"(?m)^tōî / tēî ", "", ""],
            ["noun", r"^τοῖς / ταῖς ", "", ""],
            ["noun", r"(?m)^toîs / taîs ", "", ""],
            ["noun", r"^τὸν / τὴν ", "", ""],
            ["noun", r"(?m)^tòn / tḕn ", "", ""],
            ["noun", r"^τοὺς / τᾱ̀ς ", "", ""],
            ["noun", r"(?m)^toùs / tā̀s ", "", ""],
            ["noun", r"^οἱ ", "", ""],
            ["noun", r"(?m)^hoi ", "", ""],
            ["noun", r"^τοῦ ", "", ""],
            ["noun", r"(?m)^toû ", "", ""],
            ["noun", r"^τῷ ", "", ""],
            ["noun", r"(?m)^tôi ", "", ""],
            ["noun", r"^τοῖς ", "", ""],
            ["noun", r"(?m)^toîs ", "", ""],
            ["noun", r"^τὸν ", "", ""],
            # Romanized counterpart of the rule above; this slot used to
            # repeat the Greek article, so "tòn " was never stripped.
            ["noun", r"(?m)^tòn ", "", ""],
            ["noun", r"^τοὺς ", "", ""],
            ["noun", r"(?m)^toùs ", "", ""],
            ["noun", r"^τὸ ", "", ""],
            ["noun", r"(?m)^tò ", "", ""],
            ["noun", r"^τᾰ̀ ", "", ""],
            ["noun", r"(?m)^tằ ", "", ""],
            ["noun", r"^τοῖσῐ / τοῖσῐν ", "", ""],
            # XXX THIS IS BAD, IF POSSIBLE FIX, ISSUE #1313
            ["noun", r"(?m)^toîsĭ\(n\) ", "", ""],
            ["noun", r"(?m)^toîsĭ ", "", ""],
            ["noun", r"(?m)^toîsĭn ", "", ""],
            # END BAD
            # ["noun", r"^", "", ""],
            # ["noun", r"(?m)^", "", ""],
        ],
    },
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": {"number"},
        "hdr_expand_cont": {"gender", "referent", "misc", "number", "class"},
    },
    "Aragonese": {
        "next": "romance-group",
    },
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                # NOTE(review): the plural row maps to the same tags as the
                # singular row; confirm whether "possessed-many" was
                # intended here.
                ("possessive", "plural"): ["possessive", "possessed-single"],
            },
        },
    },
    "Aromanian": {
        "next": "romance-group",
    },
    "Aramaic": {
        "next": "semitic-group",
    },
    "Avestan": {
        "next": "Proto-Indo-European",
    },
    "Bavarian": {
        "next": "German",
    },
    "Baiso": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Belarusian": {
        "next": "slavic-group",
    },
    "Bende": {
        "next": "bantu-group",
    },
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {
        "next": "romance-group",
    },
    "Chichewa": {
        "next": "bantu-group",
    },
    "Chimwiini": {
        "next": "bantu-group",
    },
    "Cimbrian": {
        "next": "German",
    },
    "Corsican": {
        "next": "romance-group",
    },
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": {"tense", "mood", "non-finite"},
        "hdr_expand_cont": {"tense", "mood", "voice"},
    },
    "Dalmatian": {
        "next": "romance-group",
    },
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "form_transformations": [
            ["noun", r"^\(as a measure\) ", "", ""],
        ],
    },
    "Eblaite": {
        "next": "semitic-group",
    },
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {
        "next": "romance-group",
    },
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ["verb", r"^\(to\) ", "", ""],
            ["verb", "^to ", "", ""],
            ["verb", r"^I ", "", "first-person singular"],
            ["verb", r"^you ", "", "second-person"],
            ["verb", r"^he ", "", "third-person singular"],
            ["verb", r"^we ", "", "first-person plural"],
            ["verb", r"^they ", "", "third-person"],
            ["verb", r"^it ", "", "third-person singular"],
            ["verb", r"^thou ", "", "second-person singular"],
            ["verb", r"^ye ", "", "second-person plural"],
            ["verb", r" \(thou\)$", "", "second-person singular"],
            ["verb", r" \(ye\)$", "", "second-person plural"],
            ["verb", r"^he/she/it ", "", "third-person singular"],
            ["verb", r"^he/she/it/they ", "", "third-person singular"],
            ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
            ["verb", r"\bthem ", "", "third-person"],
            ["verb", r"\bus ", "", "first-person plural"],
            ["verb", r"\bme ", "", "first-person singular"],
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": {"non-finite"},
        "hdr_expand_cont": {"voice"},
    },
    "Faroese": {
        "ignore_top_left_text_cell": True,
    },
    "Fijian": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Finnish": {
        "hdr_expand_first": set(),
    },
    "French": {
        "next": "romance-group",
    },
    "Friulian": {
        "next": "romance-group",
    },
    "Galician": {
        "next": "romance-group",
    },
    "German": {
        "next": "german-group",
        "form_transformations": [
            ["verb", "^ich ", "", "first-person singular"],
            ["verb", "^du ", "", "second-person singular"],
            ["verb", "^er ", "", "third-person singular"],
            ["verb", "^wir ", "", "first-person plural"],
            ["verb", "^ihr ", "", "second-person plural"],
            ["verb", "^sie ", "", "third-person plural"],
            [
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ],
            ["verb", r" \(du\)$", "", "second-person singular"],
            ["verb", r" \(ihr\)$", "", "second-person plural"],
            ["adj", "^er ist ", "", "masculine singular"],
            ["adj", "^sie ist ", "", "feminine singular"],
            ["adj", "^es ist ", "", "neuter singular"],
            ["adj", "^sie sind ", "", "plural"],
            ["adj", "^keine ", "keine ", "negative"],
            ["adj", "^keiner ", "keiner ", "negative"],
            ["adj", "^keinen ", "keinen ", "negative"],
        ],
        "conditionally_ignored_cells": {
            "definite": [
                "der",
                "die",
                "das",
                "des",
                "dem",
                "den",
            ],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": [
                "kein",
                "keine",
                "keiner",
                "keinen",
            ],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": {"mood", "non-finite"},
        "hdr_expand_cont": {"tense"},
    },
    "Gothic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": {"mood", "tense", "aspect", "dummy"},
        "hdr_expand_cont": {"tense", "person", "number", "aspect"},
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For Greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {
        "next": "austronesian-group",
    },
    "Hebrew": {
        "next": "semitic-group",
    },
    "Hijazi Arabic": {
        "next": "semitic-group",
    },
    "Hopi": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Hungarian": {
        "hdr_expand_first": set(),
        "hdr_expand_cont": set(),
    },
    "Hunsrik": {
        "next": "German",
    },
    "Icelandic": {
        "ignore_top_left_text_cell": True,
    },
    "Ilokano": {
        "next": "austronesian-group",
    },
    "Inari Sami": {
        "next": "samojedic-group",
    },
    "Inuktitut": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": {"mood", "tense"},
        "hdr_expand_cont": {"person", "register", "number", "misc"},
        "form_transformations": [
            ["verb", "^non ", "", "negative"],
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {
        "next": "bantu-group",
    },
    "Kapampangan": {
        "next": "austronesian-group",
    },
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {
        "next": "bantu-group",
    },
    "Ladin": {
        "next": "romance-group",
    },
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {
        "empty_row_resets": True,
    },
    "Ligurian": {
        "next": "romance-group",
    },
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {
        "next": "bantu-group",
    },
    "Lombard": {
        "next": "romance-group",
    },
    "Lower Sorbian": {
        "next": "slavic-group",
    },
    "Luganda": {
        "next": "bantu-group",
    },
    "Lule Sami": {
        "next": "samojedic-group",
    },
    "Luxembourgish": {
        "next": "German",
    },
    "Maltese": {
        "next": "semitic-group",
    },
    "Maore Comorian": {
        "next": "bantu-group",
    },
    "Masaba": {
        "next": "bantu-group",
    },
    "Mirandese": {
        "next": "romance-group",
    },
    "Moroccan Arabic": {
        "next": "semitic-group",
    },
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {
        "next": "bantu-group",
    },
    "Mwani": {
        "next": "bantu-group",
    },
    "Navajo": {
        "numbers": [
            "singular",
            "plural",
            "dual",
            "duoplural",
        ],
    },
    "Neapolitan": {
        "next": "romance-group",
    },
    "Nenets": {
        "next": "uralic-group",
    },
    "Ngazidja Comorian": {
        "next": "bantu-group",
    },
    "Niuean": {
        "next": "austronesian-group",
    },
    "Northern Kurdish": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Northern Ndebele": {
        "next": "bantu-group",
    },
    "Northern Sami": {
        "next": "samojedic-group",
    },
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {
        "next": "bantu-group",
    },
    "Occitan": {
        "next": "romance-group",
    },
    "Old Church Slavonic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Old English": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Norse": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Irish": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Pennsylvania German": {
        "next": "German",
    },
    "Phoenician": {
        "next": "semitic-group",
    },
    "Phuthi": {
        "next": "bantu-group",
    },
    "Pite Sami": {
        "next": "samojedic-group",
    },
    "Polish": {
        "next": "slavic-group",
    },
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Proto-Indo-European": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Proto-Samic": {
        "next": "samojedic-group",
    },
    "Proto-Uralic": {
        "next": "uralic-group",
    },
    "Raga": {
        "numbers": ["singular", "dual", "trial", "plural"],
    },
    "Romagnol": {
        "next": "romance-group",
    },
    "Romanian": {
        "next": "romance-group",
    },
    "Romansch": {
        "next": "romance-group",
    },
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": {"non-finite", "mood", "tense"},
        "hdr_expand_cont": {"tense", "number"},
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {
        "next": "bantu-group",
    },
    "Sanskrit": {
        "next": "Proto-Indo-European",
    },
    "Sardinian": {
        "next": "romance-group",
    },
    "Sassarese": {
        "next": "romance-group",
    },
    "Scottish Gaelic": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {
        "next": "romance-group",
    },
    "Skolt Sami": {
        "next": "samojedic-group",
    },
    "Slovene": {
        "next": "slavic-group",
    },
    "Shona": {
        "next": "bantu-group",
    },
    "Sotho": {
        "next": "bantu-group",
    },
    "South Levantine Arabic": {
        "next": "semitic-group",
    },
    "Southern Ndebele": {
        "next": "bantu-group",
    },
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ["verb", "^no ", "", "negative"],
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {
        "next": "bantu-group",
    },
    "Swedish": {
        "hdr_expand_first": {"referent"},
        "hdr_expand_cont": {"degree", "polarity"},
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {
        "next": "bantu-group",
    },
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {
        "next": "austronesian-group",
    },
    "Tausug": {
        "next": "austronesian-group",
    },
    "Tigre": {
        "next": "semitic-group",
    },
    "Tigrinya": {
        "next": "semitic-group",
    },
    "Tongan": {
        "next": "austronesian-group",
    },
    "Tsonga": {
        "next": "bantu-group",
    },
    "Tswana": {
        "next": "bantu-group",
    },
    "Tumbuka": {
        "next": "bantu-group",
    },
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {
        "next": "semitic-group",
    },
    "Ukrainian": {
        "next": "slavic-group",
    },
    "Upper Sorbian": {
        "next": "slavic-group",
    },
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {
        "next": "romance-group",
    },
    "Warlpiri": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Xhosa": {
        "next": "bantu-group",
    },
    "Zulu": {
        "next": "bantu-group",
    },
    "ǃXóõ": {
        "next": "bantu-group",
    },
}

879 

880 

881# Sanity check lang_specific 

882# def_ls_keys = lang_specific["default"].keys() 

883# for k, v in lang_specific.items(): 

884# if k[0].isupper() and k not in languages_by_name: 

885# raise AssertionError( 

886# "key {!r} in lang_specific is not a valid language" 

887# .format(k)) 

888# assert isinstance(v, dict) 

889# for kk, vv in v.items(): 

890# if kk not in def_ls_keys and kk != "next": 

891# raise AssertionError("{} key {!r} not in default entry" 

892# .format(k, kk)) 

893# if kk in ("hdr_expand_first", "hdr_expand_cont"): 

894# if not isinstance(vv, set): 

895# raise AssertionError("{} key {!r} must be set" 

896# .format(k, kk)) 

897# for t in vv: 

898# if t not in tag_categories: 

899# raise AssertionError("{} key {!r} invalid tag category {}" 

900# .format(k, kk, t)) 

901# elif kk in ("genders", "numbers", "persons", "strengths", "voices"): 

902# if not vv: 

903# continue 

904# if not isinstance(vv, (list, tuple, set)): 

905# raise AssertionError("{} key {!r} must be list/tuple/set" 

906# .format(k, kk)) 

907# for t in vv: 

908# if t not in valid_tags: 

909# raise AssertionError("{} key {!r} invalid tag {!r}" 

910# .format(k, kk, t)) 

911# elif kk == "lang_tag_mappings" and vv is not None: 

912# for pos, transf in vv.items(): 

913# assert pos in PARTS_OF_SPEECH 

914# assert isinstance(transf, dict) 

915# for pre, post in transf.items(): 

916# assert isinstance(pre, tuple) 

917# assert all(t in valid_tags for t in pre) 

918# assert isinstance(post, list) 

919# assert all(t in valid_tags for t in post) 

920# elif kk == "next": 

921# if vv not in lang_specific: 

922# raise AssertionError("{} key {!r} value {!r} is not defined" 

923# .format(k, kk, vv)) 

924 

925 

def get_lang_conf(lang: str, field: str):
    """Return the value of ``field`` configured for ``lang``.

    The lookup starts at ``lang_specific[lang]`` and, while the field is
    missing, moves on to the entry named by that entry's ``"next"`` key
    (or ``"default"`` when there is none).  A language that has no entry
    at all is looked up directly in ``"default"``.

    Raises:
        RuntimeError: if ``field`` is not present even in the
            ``"default"`` entry, if ``lang_specific`` has no
            ``"default"`` entry, or if the ``"next"`` links form a cycle
            (which would otherwise loop forever).
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    # Entries already inspected without finding the field; used to detect
    # misconfigured "next" chains that point back to themselves.
    visited: set[str] = set()
    while True:
        lconfigs = lang_specific.get(lang)
        if lconfigs is None:
            if lang == "default":
                raise RuntimeError("lang_specific has no 'default' entry")
            # Unknown language or dangling "next": use the defaults.
            lang = "default"
            continue
        if field in lconfigs:
            return lconfigs[field]
        if lang == "default":
            raise RuntimeError(f"Invalid lang_specific field {field!r}")
        if lang in visited:
            raise RuntimeError(
                f"Cyclic 'next' chain in lang_specific at {lang!r}"
            )
        visited.add(lang)
        lang = lconfigs.get("next", "default")

941 

942 

def lang_specific_tags(lang, pos, form):
    """Derive tags from a word form using per-language rewrite rules.

    The ``form_transformations`` rules configured for ``lang`` are tried
    in order.  Each rule is ``[part of speech, regex, replacement, tags]``;
    the first rule whose part of speech equals ``pos`` and whose regex is
    found in ``form`` wins.  The matched span is replaced by the rule's
    replacement text and the rule's space-separated tags are returned with
    the adjusted form, e.g. a German verb form has its leading pronoun
    ("^ich ") removed and gains person and number tags.

    Returns a ``(form, tags)`` tuple; when no rule applies, the form is
    returned unchanged together with an empty list.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    for rule_pos, regex, replacement, tag_spec in get_lang_conf(
        lang, "form_transformations"
    ):
        # Every rule must name a known part of speech, even when it does
        # not apply to the current word.
        assert rule_pos in PARTS_OF_SPEECH
        if rule_pos != pos:
            continue
        found = re.search(regex, form)
        if found is None:
            continue
        # Splice the literal replacement in place of the matched span.
        adjusted = form[: found.start()] + replacement + form[found.end() :]
        extracted = tag_spec.split()
        assert all(tag in valid_tags for tag in extracted)
        return adjusted, extracted
    return form, []