Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 96%
36 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1# Language-specific configuration for various aspects of inflection table
2# parsing.
4import re
5from typing import Optional, TypedDict, Union
7from ...tags import valid_tags
8from .parts_of_speech import PARTS_OF_SPEECH
# Schema of one entry in the language-specific configuration table.
# Every key is optional (total=False): an entry only lists the fields it
# overrides, plus an optional "next" naming the entry to fall back to.
class LangConfDict(TypedDict, total=False):
    # Name of the entry consulted when a field is missing from this one.
    next: str
    hdr_expand_first: set[str]
    hdr_expand_cont: set[str]
    animate_inanimate_remove: bool
    both_active_passive_remove: bool
    both_strong_weak_remove: bool
    definitenesses: list[str]
    empty_row_resets: bool
    # Rules used for tag extraction by lang_specific_tags().
    form_transformations: list[list[str]]
    genders: Optional[list[str]]
    imperative_no_tense: bool
    # Slavic special case.
    masc_only_animate: bool
    numbers: list[str]
    persons: list[str]
    pl_virile_nonvirile: bool
    # One of "stop", "skip" or "reuse".
    reuse_cellspan: str
    skip_mood_mood: bool
    skip_tense_tense: bool
    stop_non_finite_non_finite: bool
    stop_non_finite_voice: bool
    stop_non_finite_tense: bool
    strengths: list[str]
    virile_nonvirile_remove: bool
    voices: list[str]
    # Value layout: (split phrase, tags).
    special_phrase_splits: dict[str, list[Union[list[str], str]]]
    # Value layout: [replacement, tags].
    form_replacements: dict[str, Union[str, list[str]]]
    # Greek-style bracket semantics.
    parentheses_for_informal: bool
    square_brackets_for_rare: bool
    curly_brackets_for_archaic: bool
    # Armenian; old data was migrated here.
    lang_tag_mappings: Optional[dict[str, dict[tuple[str, ...], list[str]]]]
    # Spanish tables contain many "vos" and "tú" markers that look like
    # references and give their form certain tags.  This maps such a
    # reference ("vos") to a tag string ("first-person singular") whose
    # tags *extend* the form's tags.
    special_references: Optional[dict[str, str]]
    # Some languages, like Icelandic and Faroese, have text cells in the
    # upper left corner that should be ignored.
    ignore_top_left_text_cell: bool
    # Minor regex replacements for cleanup in parse_simple_table(),
    # given as {regex: substitution}.
    minor_text_cleanups: Optional[dict[str, str]]
    articles_in_separate_columns: bool
    # Cells to ignore in this language, unless the cell has the key
    # as a tag.
    conditionally_ignored_cells: dict[str, list[str]]
# Language-specific configuration table.
#
# Keys are either "default", a language-group name ending in "-group", or
# a language name.  An entry lists only the fields it overrides; when a
# field is missing, get_lang_conf() follows the entry's "next" key (or
# falls back to "default" when there is no "next").  The "default" entry
# therefore has to define every field except "next".
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": {
            "number",
            "mood",
            "referent",
            "aspect",
            "tense",
            "voice",
            "non-finite",
            "case",
            "possession",
        },
        "hdr_expand_cont": {
            "person",
            "gender",
            "number",
            "degree",
            "polarity",
            "voice",
            "misc",
        },
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
    },
    # ----- language groups -----
    "austronesian-group": {"numbers": ["singular", "dual", "plural"]},
    "bantu-group": {"genders": None},
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {"next": "uralic-group"},
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {"numbers": ["singular", "dual", "plural"]},
    # Languages closely related to or an offshoot of German
    "german-group": {
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    # Germanic languages as a whole
    "germanic-group": {"next": "indo-european-group"},
    # ----- individual languages -----
    "Akkadian": {"next": "semitic-group"},
    "Alemannic German": {"next": "German"},
    "Amharic": {"next": "semitic-group"},
    "Ancient Greek": {
        "next": "Proto-Indo-European",  # Has dual
        "form_transformations": [
            # Used to remove the gendered article alternatives at the start
            # of table entries like ἰχθυοκένταυρος / Ancient Greek.
            # lang_specific_tags() applies only the first matching rule, so
            # each pattern is listed once even when the same article occurs
            # in several table cells.
            ["noun", "^ὁ, ἡ ", "", ""],
            ["noun", "^τὼ ", "", ""],
            ["noun", "^οἱ, αἱ ", "", ""],
            ["noun", "^τοῦ, τῆς ", "", ""],
            ["noun", "^τοῖν ", "", ""],
            ["noun", "^τῶν ", "", ""],
            ["noun", "^τῷ, τῇ ", "", ""],
            ["noun", "^τοῖς, ταῖς ", "", ""],
            ["noun", "^τὸν, τὴν ", "", ""],
            ["noun", "^τοὺς, τᾱ̀ς ", "", ""],
            # Romanized equivalents of the articles above.
            ["noun", "(?m)^ho, hē ", "", ""],
            ["noun", "(?m)^tṑ ", "", ""],
            ["noun", "(?m)^hoi, hai ", "", ""],
            # The trailing space matters: without it the stripped form
            # keeps a leading space.
            ["noun", "(?m)^toû, tês ", "", ""],
            ["noun", "(?m)^toîn ", "", ""],
            ["noun", "(?m)^tôn ", "", ""],
            ["noun", "(?m)^tôi, têi ", "", ""],
            ["noun", "(?m)^toîs, taîs ", "", ""],
            ["noun", "(?m)^tòn, tḕn ", "", ""],
            ["noun", "(?m)^toùs, tā̀s ", "", ""],
        ],
    },
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": {"number"},
        "hdr_expand_cont": {"gender", "referent", "misc", "number", "class"},
    },
    "Aragonese": {"next": "romance-group"},
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                # NOTE(review): this row used to map to "possessed-single"
                # too, making it identical to the singular row above.
                # "possessed-many" is assumed to be the intended tag;
                # confirm that it is present in valid_tags.
                ("possessive", "plural"): ["possessive", "possessed-many"],
            },
        },
    },
    "Aromanian": {"next": "romance-group"},
    "Aramaic": {"next": "semitic-group"},
    "Avestan": {"next": "Proto-Indo-European"},
    "Bavarian": {"next": "German"},
    "Baiso": {"numbers": ["singular", "paucal", "plural"]},
    "Belarusian": {"next": "slavic-group"},
    "Bende": {"next": "bantu-group"},
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {"next": "romance-group"},
    "Chichewa": {"next": "bantu-group"},
    "Chimwiini": {"next": "bantu-group"},
    "Cimbrian": {"next": "German"},
    "Corsican": {"next": "romance-group"},
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": {"tense", "mood", "non-finite"},
        "hdr_expand_cont": {"tense", "mood", "voice"},
    },
    "Dalmatian": {"next": "romance-group"},
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "form_transformations": [
            ["noun", r"^\(as a measure\) ", "", ""],
        ],
    },
    "Eblaite": {"next": "semitic-group"},
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {"next": "romance-group"},
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ["verb", r"^\(to\) ", "", ""],
            ["verb", "^to ", "", ""],
            ["verb", r"^I ", "", "first-person singular"],
            ["verb", r"^you ", "", "second-person"],
            ["verb", r"^he ", "", "third-person singular"],
            ["verb", r"^we ", "", "first-person plural"],
            ["verb", r"^they ", "", "third-person"],
            ["verb", r"^it ", "", "third-person singular"],
            ["verb", r"^thou ", "", "second-person singular"],
            ["verb", r"^ye ", "", "second-person plural"],
            ["verb", r" \(thou\)$", "", "second-person singular"],
            ["verb", r" \(ye\)$", "", "second-person plural"],
            ["verb", r"^he/she/it ", "", "third-person singular"],
            ["verb", r"^he/she/it/they ", "", "third-person singular"],
            ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
            ["verb", r"\bthem ", "", "third-person"],
            ["verb", r"\bus ", "", "first-person plural"],
            ["verb", r"\bme ", "", "first-person singular"],
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": {"non-finite"},
        "hdr_expand_cont": {"voice"},
    },
    "Faroese": {"ignore_top_left_text_cell": True},
    "Fijian": {"numbers": ["singular", "paucal", "plural"]},
    "Finnish": {"hdr_expand_first": set()},
    "French": {"next": "romance-group"},
    "Friulian": {"next": "romance-group"},
    "Galician": {"next": "romance-group"},
    "German": {
        "next": "german-group",
        "form_transformations": [
            ["verb", "^ich ", "", "first-person singular"],
            ["verb", "^du ", "", "second-person singular"],
            ["verb", "^er ", "", "third-person singular"],
            ["verb", "^wir ", "", "first-person plural"],
            ["verb", "^ihr ", "", "second-person plural"],
            ["verb", "^sie ", "", "third-person plural"],
            [
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ],
            ["verb", r" \(du\)$", "", "second-person singular"],
            ["verb", r" \(ihr\)$", "", "second-person plural"],
            ["adj", "^er ist ", "", "masculine singular"],
            ["adj", "^sie ist ", "", "feminine singular"],
            ["adj", "^es ist ", "", "neuter singular"],
            ["adj", "^sie sind ", "", "plural"],
            # The "kein*" rules keep the matched text and only add a tag.
            ["adj", "^keine ", "keine ", "negative"],
            ["adj", "^keiner ", "keiner ", "negative"],
            ["adj", "^keinen ", "keinen ", "negative"],
        ],
        "conditionally_ignored_cells": {
            "definite": [
                "der",
                "die",
                "das",
                "des",
                "dem",
                "den",
            ],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": [
                "kein",
                "keine",
                "keiner",
                "keinen",
            ],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": {"mood", "non-finite"},
        "hdr_expand_cont": {"tense"},
    },
    "Gothic": {"next": "Proto-Indo-European"},  # Has dual
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": {"mood", "tense", "aspect", "dummy"},
        "hdr_expand_cont": {"tense", "person", "number", "aspect"},
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {"next": "austronesian-group"},
    "Hebrew": {"next": "semitic-group"},
    "Hijazi Arabic": {"next": "semitic-group"},
    "Hopi": {"numbers": ["singular", "paucal", "plural"]},
    "Hungarian": {
        "hdr_expand_first": set(),
        "hdr_expand_cont": set(),
    },
    "Hunsrik": {"next": "German"},
    "Icelandic": {"ignore_top_left_text_cell": True},
    "Ilokano": {"next": "austronesian-group"},
    "Inari Sami": {"next": "samojedic-group"},
    "Inuktitut": {"numbers": ["singular", "dual", "plural"]},
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": {"mood", "tense"},
        "hdr_expand_cont": {"person", "register", "number", "misc"},
        "form_transformations": [
            ["verb", "^non ", "", "negative"],
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {"next": "bantu-group"},
    "Kapampangan": {"next": "austronesian-group"},
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {"next": "bantu-group"},
    "Ladin": {"next": "romance-group"},
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {"empty_row_resets": True},
    "Ligurian": {"next": "romance-group"},
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {"next": "bantu-group"},
    "Lombard": {"next": "romance-group"},
    "Lower Sorbian": {"next": "slavic-group"},
    "Luganda": {"next": "bantu-group"},
    "Lule Sami": {"next": "samojedic-group"},
    "Luxembourgish": {"next": "German"},
    "Maltese": {"next": "semitic-group"},
    "Maore Comorian": {"next": "bantu-group"},
    "Masaba": {"next": "bantu-group"},
    "Mirandese": {"next": "romance-group"},
    "Moroccan Arabic": {"next": "semitic-group"},
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {"next": "bantu-group"},
    "Mwani": {"next": "bantu-group"},
    "Navajo": {
        "numbers": [
            "singular",
            "plural",
            "dual",
            "duoplural",
        ],
    },
    "Neapolitan": {"next": "romance-group"},
    "Nenets": {"next": "uralic-group"},
    "Ngazidja Comorian": {"next": "bantu-group"},
    "Niuean": {"next": "austronesian-group"},
    "Northern Kurdish": {"numbers": ["singular", "paucal", "plural"]},
    "Northern Ndebele": {"next": "bantu-group"},
    "Northern Sami": {"next": "samojedic-group"},
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {"next": "bantu-group"},
    "Occitan": {"next": "romance-group"},
    "Old Church Slavonic": {"next": "Proto-Indo-European"},  # Has dual
    "Old English": {"next": "Proto-Indo-European"},  # Had dual in pronouns
    "Old Norse": {"next": "Proto-Indo-European"},  # Had dual in pronouns
    "Old Irish": {"next": "Proto-Indo-European"},  # Has dual
    "Pennsylvania German": {"next": "German"},
    "Phoenician": {"next": "semitic-group"},
    "Phuthi": {"next": "bantu-group"},
    "Pite Sami": {"next": "samojedic-group"},
    "Polish": {"next": "slavic-group"},
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {"next": "Proto-Indo-European"},  # Has dual
    "Proto-Indo-European": {"numbers": ["singular", "dual", "plural"]},
    "Proto-Samic": {"next": "samojedic-group"},
    "Proto-Uralic": {"next": "uralic-group"},
    "Raga": {"numbers": ["singular", "dual", "trial", "plural"]},
    "Romagnol": {"next": "romance-group"},
    "Romanian": {"next": "romance-group"},
    "Romansch": {"next": "romance-group"},
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": {"non-finite", "mood", "tense"},
        "hdr_expand_cont": {"tense", "number"},
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {"next": "bantu-group"},
    "Sanskrit": {"next": "Proto-Indo-European"},
    "Sardinian": {"next": "romance-group"},
    "Sassarese": {"next": "romance-group"},
    "Scottish Gaelic": {"numbers": ["singular", "dual", "plural"]},
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {"next": "romance-group"},
    "Skolt Sami": {"next": "samojedic-group"},
    "Slovene": {"next": "slavic-group"},
    "Shona": {"next": "bantu-group"},
    "Sotho": {"next": "bantu-group"},
    "South Levantine Arabic": {"next": "semitic-group"},
    "Southern Ndebele": {"next": "bantu-group"},
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ["verb", "^no ", "", "negative"],
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {"next": "bantu-group"},
    "Swedish": {
        "hdr_expand_first": {"referent"},
        "hdr_expand_cont": {"degree", "polarity"},
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {"next": "bantu-group"},
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {"next": "austronesian-group"},
    "Tausug": {"next": "austronesian-group"},
    "Tigre": {"next": "semitic-group"},
    "Tigrinya": {"next": "semitic-group"},
    "Tongan": {"next": "austronesian-group"},
    "Tsonga": {"next": "bantu-group"},
    "Tswana": {"next": "bantu-group"},
    "Tumbuka": {"next": "bantu-group"},
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {"next": "semitic-group"},
    "Ukrainian": {"next": "slavic-group"},
    "Upper Sorbian": {"next": "slavic-group"},
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {"next": "romance-group"},
    "Warlpiri": {"numbers": ["singular", "paucal", "plural"]},
    "Xhosa": {"next": "bantu-group"},
    "Zulu": {"next": "bantu-group"},
    "ǃXóõ": {"next": "bantu-group"},
}
827# Sanity check lang_specific
828# def_ls_keys = lang_specific["default"].keys()
829# for k, v in lang_specific.items():
830# if k[0].isupper() and k not in languages_by_name:
831# raise AssertionError(
832# "key {!r} in lang_specific is not a valid language"
833# .format(k))
834# assert isinstance(v, dict)
835# for kk, vv in v.items():
836# if kk not in def_ls_keys and kk != "next":
837# raise AssertionError("{} key {!r} not in default entry"
838# .format(k, kk))
839# if kk in ("hdr_expand_first", "hdr_expand_cont"):
840# if not isinstance(vv, set):
841# raise AssertionError("{} key {!r} must be set"
842# .format(k, kk))
843# for t in vv:
844# if t not in tag_categories:
845# raise AssertionError("{} key {!r} invalid tag category {}"
846# .format(k, kk, t))
847# elif kk in ("genders", "numbers", "persons", "strengths", "voices"):
848# if not vv:
849# continue
850# if not isinstance(vv, (list, tuple, set)):
851# raise AssertionError("{} key {!r} must be list/tuple/set"
852# .format(k, kk))
853# for t in vv:
854# if t not in valid_tags:
855# raise AssertionError("{} key {!r} invalid tag {!r}"
856# .format(k, kk, t))
857# elif kk == "lang_tag_mappings" and vv is not None:
858# for pos, transf in vv.items():
859# assert pos in PARTS_OF_SPEECH
860# assert isinstance(transf, dict)
861# for pre, post in transf.items():
862# assert isinstance(pre, tuple)
863# assert all(t in valid_tags for t in pre)
864# assert isinstance(post, list)
865# assert all(t in valid_tags for t in post)
866# elif kk == "next":
867# if vv not in lang_specific:
868# raise AssertionError("{} key {!r} value {!r} is not defined"
869# .format(k, kk, vv))
def get_lang_conf(lang: str, field: str):
    """Return configuration ``field`` for ``lang`` from ``lang_specific``.

    The lookup starts at the entry for ``lang`` and, while the field is
    missing, moves on to the entry named by that entry's ``"next"`` key
    (or to ``"default"`` when the entry has no ``"next"``).  A name that
    has no entry at all is treated as ``"default"``.

    Raises:
        RuntimeError: if the field is not defined even in the ``"default"``
            entry, or if the fallback chain revisits an entry (a cyclic
            ``"next"`` chain, or a table without a ``"default"`` entry),
            which would otherwise loop forever.
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    visited = set()  # entry names already consulted; guards against cycles
    while lang not in visited:
        visited.add(lang)
        lconfigs = lang_specific.get(lang)
        if lconfigs is None:
            # Unknown language or dangling "next": use the defaults.
            lang = "default"
            continue
        if field in lconfigs:
            return lconfigs[field]
        if lang == "default":
            # End of every chain; nothing left to fall back to.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        lang = lconfigs.get("next", "default")
    raise RuntimeError(
        "Cyclic or unresolvable fallback chain in lang_specific "
        "at {!r} while looking up {!r}".format(lang, field)
    )
def lang_specific_tags(lang, pos, form):
    """Derive tags from a word form using language-specific rewrite rules.

    The rules come from the "form_transformations" setting of ``lang``;
    each rule is (part of speech, regex, replacement, tag string).  Rules
    for other parts of speech are skipped.  The first rule whose regex is
    found in ``form`` wins: the matched span is replaced by the rule's
    replacement and the rule's whitespace-separated tags are returned with
    the adjusted form.  For example, German inflected verb forms do not
    have person and number specified in the table but include a pronoun,
    which such a rule strips and converts into tags.

    Returns a ``(form, tags)`` pair; when no rule applies, the form is
    returned unchanged together with an empty list.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    transformations = get_lang_conf(lang, "form_transformations")
    for rule_pos, regex, replacement, tag_str in transformations:
        # e.g. ("verb", "^ich ", "", "first-person singular")
        assert rule_pos in PARTS_OF_SPEECH
        if rule_pos != pos:
            continue
        hit = re.search(regex, form)
        if hit is None:
            continue
        extracted = tag_str.split()
        assert all(tag in valid_tags for tag in extracted)
        # Splice the replacement in place of the matched span only.
        adjusted = form[: hit.start()] + replacement + form[hit.end() :]
        return adjusted, extracted
    return form, []