Coverage for src/wiktextract/extractor/en/lang_specific

1# Language-specific configuration for various aspects of inflection table

2# parsing.

4import re

5from typing import Optional, TypedDict, Union

7from ...tags import valid_tags

8from .parts_of_speech import PARTS_OF_SPEECH

# Schema of one entry in `lang_specific`.  Every key is optional
# (total=False): an entry only lists the fields it overrides, and
# get_lang_conf() follows "next" / falls back to the "default" entry for
# everything else.
class LangConfDict(TypedDict, total=False):
    # Name of the lang_specific entry to consult when a field is missing here
    next: str
    hdr_expand_first: set[str]
    hdr_expand_cont: set[str]
    animate_inanimate_remove: bool
    both_active_passive_remove: bool
    both_strong_weak_remove: bool
    definitenesses: list[str]
    empty_row_resets: bool
    # tag extraction, lang_specific_tags(); each rule is
    # [part-of-speech, regex, replacement, space-separated tags]
    form_transformations: list[list[str]]
    genders: Optional[list[str]]
    imperative_no_tense: bool
    masc_only_animate: bool  # Slavic special
    numbers: list[str]
    persons: list[str]
    pl_virile_nonvirile: bool
    reuse_cellspan: str  # stop/skip/reuse
    skip_mood_mood: bool
    skip_tense_tense: bool
    stop_non_finite_non_finite: bool
    stop_non_finite_voice: bool
    stop_non_finite_tense: bool
    strengths: list[str]
    virile_nonvirile_remove: bool
    voices: list[str]
    # value: (split phrase, tags)
    special_phrase_splits: dict[str, list[Union[list[str], str]]]
    # value: [replacement, tags]
    form_replacements: dict[str, Union[str, list[str]]]
    # Greek-style bracket semantics
    parentheses_for_informal: bool
    square_brackets_for_rare: bool
    curly_brackets_for_archaic: bool
    # Armenian; migrated old data here
    lang_tag_mappings: Optional[dict[str, dict[tuple[str, ...], list[str]]]]
    # Spanish has a lot of "vos" and "tú" in its tables that look like
    # references, and they give their form certain tags.
    # Dict of references ("vos") that point to tag strings "first-person
    # singular" that *extend* tags.
    special_references: Optional[dict[str, str]]
    # Some languages like Icelandic and Faroese have text cells in the
    # upper left that we'd like to ignore.
    ignore_top_left_text_cell: bool
    # Minor regex replacements for cleanup in parse_simple_table();
    # dict of {regex: substitution}
    minor_text_cleanups: Optional[dict[str, str]]
    articles_in_separate_columns: bool
    # Cells to ignore in this language, unless the cell has the key
    # as a tag.
    conditionally_ignored_cells: dict[str, list[str]]

# Per-language configuration table for inflection-table parsing.
#
# Keys are either language names ("German"), shared "-group" entries that
# several languages inherit from, or "default".  An entry only lists the
# fields it overrides; get_lang_conf() resolves a missing field by following
# the entry's "next" key and finally falling back to "default", which
# therefore must define every field.
#
# "form_transformations" rules are [part-of-speech, regex, replacement,
# space-separated tags] and are applied by lang_specific_tags().  Literal
# parentheses in those regexes must be escaped (r"\(" / r"\)").
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": {
            "number",
            "mood",
            "referent",
            "aspect",
            "tense",
            "voice",
            "non-finite",
            "case",
            "possession",
        },
        "hdr_expand_cont": {
            "person",
            "gender",
            "number",
            "degree",
            "polarity",
            "voice",
            "misc",
        },
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
    },
    "austronesian-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "bantu-group": {
        "genders": None,
    },
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {
        "next": "uralic-group",
    },
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "german-group": {  # languages closely related to or offshoots of German
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    "germanic-group": {  # Germanic languages as a whole
        "next": "indo-european-group",
    },
    "Akkadian": {"next": "semitic-group"},
    "Alemannic German": {"next": "German"},
    "Amharic": {"next": "semitic-group"},
    "Ancient Greek": {
        "next": "Proto-Indo-European",  # Has dual
        "form_transformations": [
            # Used to remove the gendered article alternatives at the start
            # of table entries like ἰχθυοκένταυρος / Ancient Greek
            ["noun", "^ὁ, ἡ ", "", ""],
            ["noun", "^τὼ ", "", ""],
            ["noun", "^οἱ, αἱ ", "", ""],
            ["noun", "^τοῦ, τῆς ", "", ""],
            ["noun", "^τοῖν ", "", ""],
            ["noun", "^τῶν ", "", ""],
            ["noun", "^τῷ, τῇ ", "", ""],
            ["noun", "^τοῖς, ταῖς ", "", ""],
            ["noun", "^τὸν, τὴν ", "", ""],
            ["noun", "^τὼ ", "", ""],
            ["noun", "^τοὺς, τᾱ̀ς ", "", ""],
            ["noun", "(?m)^ho, hē ", "", ""],
            ["noun", "(?m)^tṑ ", "", ""],
            ["noun", "(?m)^hoi, hai ", "", ""],
            # NOTE(review): unlike its siblings this pattern has no trailing
            # space, so the space after the article is left in the form.
            ["noun", "(?m)^toû, tês", "", ""],
            ["noun", "(?m)^toîn ", "", ""],
            ["noun", "(?m)^tôn ", "", ""],
            ["noun", "(?m)^tôi, têi ", "", ""],
            ["noun", "(?m)^toîs, taîs ", "", ""],
            ["noun", "(?m)^tòn, tḕn ", "", ""],
            ["noun", "(?m)^tṑ ", "", ""],
            ["noun", "(?m)^toùs, tā̀s ", "", ""],
        ],
    },
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": {"number"},
        "hdr_expand_cont": {"gender", "referent", "misc", "number", "class"},
    },
    "Aragonese": {"next": "romance-group"},
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                # Plural possessed noun; previously mapped to
                # "possessed-single", identical to the singular case.
                ("possessive", "plural"): ["possessive", "possessed-many"],
            },
        },
    },
    "Aromanian": {"next": "romance-group"},
    "Aramaic": {"next": "semitic-group"},
    "Avestan": {"next": "Proto-Indo-European"},
    "Bavarian": {"next": "German"},
    "Baiso": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Belarusian": {"next": "slavic-group"},
    "Bende": {"next": "bantu-group"},
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {"next": "romance-group"},
    "Chichewa": {"next": "bantu-group"},
    "Chimwiini": {"next": "bantu-group"},
    "Cimbrian": {"next": "German"},
    "Corsican": {"next": "romance-group"},
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": {"tense", "mood", "non-finite"},
        "hdr_expand_cont": {"tense", "mood", "voice"},
    },
    "Dalmatian": {"next": "romance-group"},
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "form_transformations": [
            # Strip a leading literal "(as a measure) "
            ["noun", r"^\(as a measure\) ", "", ""],
        ],
    },
    "Eblaite": {"next": "semitic-group"},
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {"next": "romance-group"},
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ["verb", r"^\(to\) ", "", ""],
            ["verb", "^to ", "", ""],
            ["verb", r"^I ", "", "first-person singular"],
            ["verb", r"^you ", "", "second-person"],
            ["verb", r"^he ", "", "third-person singular"],
            ["verb", r"^we ", "", "first-person plural"],
            ["verb", r"^they ", "", "third-person"],
            ["verb", r"^it ", "", "third-person singular"],
            ["verb", r"^thou ", "", "second-person singular"],
            ["verb", r"^ye ", "", "second-person plural"],
            ["verb", r" \(thou\)$", "", "second-person singular"],
            ["verb", r" \(ye\)$", "", "second-person plural"],
            ["verb", r"^he/she/it ", "", "third-person singular"],
            ["verb", r"^he/she/it/they ", "", "third-person singular"],
            ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
            ["verb", r"\bthem ", "", "third-person"],
            ["verb", r"\bus ", "", "first-person plural"],
            ["verb", r"\bme ", "", "first-person singular"],
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": {"non-finite"},
        "hdr_expand_cont": {"voice"},
    },
    "Faroese": {
        "ignore_top_left_text_cell": True,
    },
    "Fijian": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Finnish": {
        "hdr_expand_first": set(),
    },
    "French": {"next": "romance-group"},
    "Friulian": {"next": "romance-group"},
    "Galician": {"next": "romance-group"},
    "German": {
        "next": "german-group",
        "form_transformations": [
            ["verb", "^ich ", "", "first-person singular"],
            ["verb", "^du ", "", "second-person singular"],
            ["verb", "^er ", "", "third-person singular"],
            ["verb", "^wir ", "", "first-person plural"],
            ["verb", "^ihr ", "", "second-person plural"],
            ["verb", "^sie ", "", "third-person plural"],
            [
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ],
            ["verb", r" \(du\)$", "", "second-person singular"],
            ["verb", r" \(ihr\)$", "", "second-person plural"],
            ["adj", "^er ist ", "", "masculine singular"],
            ["adj", "^sie ist ", "", "feminine singular"],
            ["adj", "^es ist ", "", "neuter singular"],
            ["adj", "^sie sind ", "", "plural"],
            # The "kein*" rules keep the word in the form (replacement equals
            # the matched text) and only add the tag.
            ["adj", "^keine ", "keine ", "negative"],
            ["adj", "^keiner ", "keiner ", "negative"],
            ["adj", "^keinen ", "keinen ", "negative"],
        ],
        "conditionally_ignored_cells": {
            "definite": ["der", "die", "das", "des", "dem", "den"],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": ["kein", "keine", "keiner", "keinen"],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": {"mood", "non-finite"},
        "hdr_expand_cont": {"tense"},
    },
    "Gothic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": {"mood", "tense", "aspect", "dummy"},
        "hdr_expand_cont": {"tense", "person", "number", "aspect"},
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For Greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {"next": "austronesian-group"},
    "Hebrew": {"next": "semitic-group"},
    "Hijazi Arabic": {"next": "semitic-group"},
    "Hopi": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Hungarian": {
        "hdr_expand_first": set(),
        "hdr_expand_cont": set(),
    },
    "Hunsrik": {"next": "German"},
    "Icelandic": {
        "ignore_top_left_text_cell": True,
    },
    "Ilokano": {"next": "austronesian-group"},
    "Inari Sami": {"next": "samojedic-group"},
    "Inuktitut": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": {"mood", "tense"},
        "hdr_expand_cont": {"person", "register", "number", "misc"},
        "form_transformations": [
            ["verb", "^non ", "", "negative"],
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {"next": "bantu-group"},
    "Kapampangan": {"next": "austronesian-group"},
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {"next": "bantu-group"},
    "Ladin": {"next": "romance-group"},
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {
        "empty_row_resets": True,
    },
    "Ligurian": {"next": "romance-group"},
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {"next": "bantu-group"},
    "Lombard": {"next": "romance-group"},
    "Lower Sorbian": {"next": "slavic-group"},
    "Luganda": {"next": "bantu-group"},
    "Lule Sami": {"next": "samojedic-group"},
    "Luxembourgish": {"next": "German"},
    "Maltese": {"next": "semitic-group"},
    "Maore Comorian": {"next": "bantu-group"},
    "Masaba": {"next": "bantu-group"},
    "Mirandese": {"next": "romance-group"},
    "Moroccan Arabic": {"next": "semitic-group"},
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {"next": "bantu-group"},
    "Mwani": {"next": "bantu-group"},
    "Navajo": {
        "numbers": [
            "singular",
            "plural",
            "dual",
            "duoplural",
        ],
    },
    "Neapolitan": {"next": "romance-group"},
    "Nenets": {"next": "uralic-group"},
    "Ngazidja Comorian": {"next": "bantu-group"},
    "Niuean": {"next": "austronesian-group"},
    "Northern Kurdish": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Northern Ndebele": {"next": "bantu-group"},
    "Northern Sami": {"next": "samojedic-group"},
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {"next": "bantu-group"},
    "Occitan": {"next": "romance-group"},
    "Old Church Slavonic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Old English": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Norse": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Irish": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Pennsylvania German": {"next": "German"},
    "Phoenician": {"next": "semitic-group"},
    "Phuthi": {"next": "bantu-group"},
    "Pite Sami": {"next": "samojedic-group"},
    "Polish": {"next": "slavic-group"},
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Proto-Indo-European": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Proto-Samic": {"next": "samojedic-group"},
    "Proto-Uralic": {"next": "uralic-group"},
    "Raga": {
        "numbers": ["singular", "dual", "trial", "plural"],
    },
    "Romagnol": {"next": "romance-group"},
    "Romanian": {"next": "romance-group"},
    "Romansch": {"next": "romance-group"},
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": {"non-finite", "mood", "tense"},
        "hdr_expand_cont": {"tense", "number"},
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {"next": "bantu-group"},
    "Sanskrit": {"next": "Proto-Indo-European"},
    "Sardinian": {"next": "romance-group"},
    "Sassarese": {"next": "romance-group"},
    "Scottish Gaelic": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {"next": "romance-group"},
    "Skolt Sami": {"next": "samojedic-group"},
    "Slovene": {"next": "slavic-group"},
    "Shona": {"next": "bantu-group"},
    "Sotho": {"next": "bantu-group"},
    "South Levantine Arabic": {"next": "semitic-group"},
    "Southern Ndebele": {"next": "bantu-group"},
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ["verb", "^no ", "", "negative"],
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {"next": "bantu-group"},
    "Swedish": {
        "hdr_expand_first": {"referent"},
        "hdr_expand_cont": {"degree", "polarity"},
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {"next": "bantu-group"},
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {"next": "austronesian-group"},
    "Tausug": {"next": "austronesian-group"},
    "Tigre": {"next": "semitic-group"},
    "Tigrinya": {"next": "semitic-group"},
    "Tongan": {"next": "austronesian-group"},
    "Tsonga": {"next": "bantu-group"},
    "Tswana": {"next": "bantu-group"},
    "Tumbuka": {"next": "bantu-group"},
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {"next": "semitic-group"},
    "Ukrainian": {"next": "slavic-group"},
    "Upper Sorbian": {"next": "slavic-group"},
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {"next": "romance-group"},
    "Warlpiri": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Xhosa": {"next": "bantu-group"},
    "Zulu": {"next": "bantu-group"},
    "ǃXóõ": {"next": "bantu-group"},
}

825

826

827# Sanity check lang_specific

828# def_ls_keys = lang_specific["default"].keys()

829# for k, v in lang_specific.items():

830# if k[0].isupper() and k not in languages_by_name:

831# raise AssertionError(

832# "key {!r} in lang_specific is not a valid language"

833# .format(k))

834# assert isinstance(v, dict)

835# for kk, vv in v.items():

836# if kk not in def_ls_keys and kk != "next":

837# raise AssertionError("{} key {!r} not in default entry"

838# .format(k, kk))

839# if kk in ("hdr_expand_first", "hdr_expand_cont"):

840# if not isinstance(vv, set):

841# raise AssertionError("{} key {!r} must be set"

842# .format(k, kk))

843# for t in vv:

844# if t not in tag_categories:

845# raise AssertionError("{} key {!r} invalid tag category {}"

846# .format(k, kk, t))

847# elif kk in ("genders", "numbers", "persons", "strengths", "voices"):

848# if not vv:

849# continue

850# if not isinstance(vv, (list, tuple, set)):

851# raise AssertionError("{} key {!r} must be list/tuple/set"

852# .format(k, kk))

853# for t in vv:

854# if t not in valid_tags:

855# raise AssertionError("{} key {!r} invalid tag {!r}"

856# .format(k, kk, t))

857# elif kk == "lang_tag_mappings" and vv is not None:

858# for pos, transf in vv.items():

859# assert pos in PARTS_OF_SPEECH

860# assert isinstance(transf, dict)

861# for pre, post in transf.items():

862# assert isinstance(pre, tuple)

863# assert all(t in valid_tags for t in pre)

864# assert isinstance(post, list)

865# assert all(t in valid_tags for t in post)

866# elif kk == "next":

867# if vv not in lang_specific:

868# raise AssertionError("{} key {!r} value {!r} is not defined"

869# .format(k, kk, vv))

870

871

def get_lang_conf(lang, field):
    """Look up configuration value ``field`` for language ``lang``.

    The lookup starts at the ``lang_specific`` entry for ``lang`` and, while
    the field is missing, moves on to the entry named by that entry's
    "next" key (or to "default" when there is no "next").  A language
    that has no entry at all is resolved against "default" directly.

    Raises RuntimeError if even the "default" entry lacks ``field``.
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    current = lang
    while True:
        conf = lang_specific.get(current)
        if conf is None:
            # Unknown language or dangling "next": use the defaults.
            current = "default"
            continue
        if field in conf:
            return conf[field]
        if current == "default":
            # End of the chain and still nothing: the field name is wrong.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        current = conf.get("next", "default")

887

888

def lang_specific_tags(lang, pos, form):
    """Derive tags from the word form itself using per-language rules.

    The "form_transformations" rules configured for ``lang`` are tried in
    order; only rules whose part of speech equals ``pos`` are considered.
    The first rule whose regex is found in ``form`` wins: the matched text
    is replaced by the rule's replacement and the rule's tags are returned.
    For example, German inflected verb forms don't have person and number
    specified in the table, but include a pronoun ("^ich " -> "").

    Returns a tuple (possibly adjusted form, list of tags); the form is
    returned unchanged with an empty list when no rule applies.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    transformations = get_lang_conf(lang, "form_transformations")
    # Each rule: PoS, regex, replacement, space-separated tags
    for rule_pos, regex, replacement, tag_str in transformations:
        assert rule_pos in PARTS_OF_SPEECH
        if rule_pos != pos:
            continue
        found = re.search(regex, form)
        if found is None:
            continue
        # Splice the replacement in place of the first match only.
        prefix = form[: found.start()]
        suffix = form[found.end() :]
        extracted = tag_str.split()
        assert all(tag in valid_tags for tag in extracted)
        return prefix + replacement + suffix, extracted
    return form, []

Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 96%

36 statements