Coverage for src / wiktextract / extractor / en / lang_specific_configs.py: 82%
41 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 08:47 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 08:47 +0000
# Language-specific configuration for various aspects of inflection table
# parsing.
import re
from typing import Optional, TypedDict, Union

from ...tags import valid_tags
from .parts_of_speech import PARTS_OF_SPEECH
10LangConfDict = TypedDict(
11 "LangConfDict",
12 {
13 "next": str,
14 "hdr_expand_first": set[str],
15 "hdr_expand_cont": set[str],
16 "animate_inanimate_remove": bool,
17 "both_active_passive_remove": bool,
18 "both_strong_weak_remove": bool,
19 "definitenesses": list[str],
20 "empty_row_resets": bool,
21 "form_transformations": list[
22 # POS, pattern, replacement, tags
23 tuple[str, str, str, str]
24 ], # tag extraction, lang_specific_tags()
25 "genders": Optional[list[str]],
26 "imperative_no_tense": bool,
27 "masc_only_animate": bool, # Slavic special
28 "numbers": list[str],
29 "persons": list[str],
30 "pl_virile_nonvirile": bool,
31 "reuse_cellspan": str, # stop/skip/reuse
32 "skip_mood_mood": bool,
33 "skip_tense_tense": bool,
34 "stop_non_finite_non_finite": bool,
35 "stop_non_finite_voice": bool,
36 "stop_non_finite_tense": bool,
37 "strengths": list[str],
38 "virile_nonvirile_remove": bool,
39 "voices": list[str],
40 "special_phrase_splits": dict[
41 str, list[Union[list[str], str]]
42 ], # value: (split phrase, tags)
43 "form_replacements": dict[
44 str, Union[str, list[str]]
45 ], # value: [replacement, tags]
46 # Greek-style bracket semantics
47 "parentheses_for_informal": bool,
48 "square_brackets_for_rare": bool,
49 "curly_brackets_for_archaic": bool,
50 # Armenian; migrated old data here
51 "lang_tag_mappings": Optional[
52 dict[str, dict[tuple[str, ...], list[str]]]
53 ],
54 # Spanish has a lot of "vos" and "tú" in its tables that look like
55 # references, and they give their form certain tags.
56 # Dict of references ("vos") that point to tag strings "first-person
57 # singular" that *extend* tags.
58 "special_references": Optional[dict[str, str]],
59 # Some languages like Icelandic and Faroese have text cells in the
60 # upper left that we'd like to ignore.
61 "ignore_top_left_text_cell": bool,
62 # Minor regex replacements for cleanup in parse_simple_table()
63 "minor_text_cleanups": Optional[
64 dict[str, str]
65 ], # dict of {regex: substitution}
66 "articles_in_separate_columns": bool,
67 # Cells to ignore in this language, unless the cell has the key
68 # as a tag.
69 "conditionally_ignored_cells": dict[str, list[str]],
70 # dictionary, with the key being a tuple of POS strings so that
71 # nouns can have different remove patterns from verbs, etc.
72 "remove_text_patterns": dict[
73 tuple[str, ...], tuple[str | re.Pattern, ...]
74 ]
75 | None,
76 },
77 total=False,
78)
# Per-language overrides for inflection table parsing.
#
# Keys are either "default", a "<family>-group" pseudo-language, or a
# language name.  An entry only lists the fields it overrides; "next" names
# the entry that get_lang_conf() consults for any field missing here, and
# entries without "next" fall back directly on "default".
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": {
            "number",
            "mood",
            "referent",
            "aspect",
            "tense",
            "voice",
            "non-finite",
            "case",
            "possession",
        },
        "hdr_expand_cont": {
            "person",
            "gender",
            "number",
            "degree",
            "polarity",
            "voice",
            "misc",
        },
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
        "remove_text_patterns": None,
    },
    "austronesian-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "bantu-group": {
        "genders": None,
    },
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {
        "next": "uralic-group",
    },
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "german-group": {  # languages closely related to or offshoot from German
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    "germanic-group": {  # Germanic languages as a whole
        "next": "indo-european-group",
    },
    "Akkadian": {
        "next": "semitic-group",
    },
    "Alemannic German": {
        "next": "German",
    },
    "Amharic": {
        "next": "semitic-group",
    },
    "Ancient Greek": {
        "next": "Proto-Indo-European",  # Has dual
        "remove_text_patterns": {
            ("noun", "name"): (
                # Used to remove the gendered article alternatives at the start
                # of table entries like ἰχθυοκένταυρος / Ancient Greek
                re.compile(
                    r"(?m)^(ā |ai |hā |hai |hē |ho |ho / hē |ho, hē |hoi |"
                    r"hoi / hai |hoi, hai |o |oi |tằ |tâ |taì |tâi |"
                    r"taîs |tân |tān |tān |tâs |tā̀s |têi |tēî |têisĭ |"
                    r"têisĭ |tḕn |tês |tò |tô |tṑ |tṑ |toi |toì |tôi |"
                    r"toîn |toîs |toîsĭ |toîsĭ\(n\) |toîsĭn |toîs / taîs |"
                    r"toîs, taîs |tôi, têi |tōî / tēî |tòn |tôn |"
                    r"tòn / tḕn |tòn, tḕn |tòs |tṑs |tṑs |toû |toùs |"
                    r"toùs / tā̀s |toùs, tā̀s |toû / tês |toû, tês )"
                ),
                # Main greek pattern
                re.compile(
                    r"^(ᾱ |ᾱ̔ |αἰ |αἱ |ἡ |ὀ |ὁ |ὁ / ἡ |ὁ, ἡ |οἰ |οἱ |οἱ / αἱ |"
                    r"οἱ, αἱ |τᾰ̀ |τᾶ |τᾷ |ταὶ |ταῖς |τᾶν |τᾱν |τᾱν |τᾶς |τᾱ̀ς |"
                    r"τῇ |τὴν |τῆς |τῇσῐ |τῇσῐν |τὸ |τοι |τοὶ |τοῖ |τοῖν |"
                    r"τοῖς |"
                    r"τοῖσῐ / τοῖσῐν |τοῖς / ταῖς |τοῖς, ταῖς |τὸν |τὸν / τὴν |"
                    r"τὸν, τὴν |τὸς |τοῦ |τοὺς |τοὺς / τᾱ̀ς |τοὺς, τᾱ̀ς |"
                    r"τοῦ / τῆς |τοῦ, τῆς |τὼ |τῶ |τῷ |τῶν |τὼς |τὼς |"
                    r"τῷ / τῇ |τῷ, τῇ |τὼ )"
                ),
            ),
        },
    },
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": {"number"},
        "hdr_expand_cont": {"gender", "referent", "misc", "number", "class"},
    },
    "Aragonese": {
        "next": "romance-group",
    },
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                # NOTE(review): the plural key maps to the same tags as the
                # singular one ("possessed-single") - confirm this is intended.
                ("possessive", "plural"): ["possessive", "possessed-single"],
            },
        },
    },
    "Aromanian": {
        "next": "romance-group",
    },
    "Aramaic": {
        "next": "semitic-group",
    },
    "Avestan": {
        "next": "Proto-Indo-European",
    },
    "Bavarian": {
        "next": "German",
    },
    "Baiso": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Belarusian": {
        "next": "slavic-group",
    },
    "Bende": {
        "next": "bantu-group",
    },
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {
        "next": "romance-group",
    },
    "Chichewa": {
        "next": "bantu-group",
    },
    "Chimwiini": {
        "next": "bantu-group",
    },
    "Cimbrian": {
        "next": "German",
    },
    "Corsican": {
        "next": "romance-group",
    },
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": {"tense", "mood", "non-finite"},
        "hdr_expand_cont": {"tense", "mood", "voice"},
    },
    "Dalmatian": {
        "next": "romance-group",
    },
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "remove_text_patterns": {
            # tuples need the comma to be happy
            ("noun",): (re.compile(r"^\(as a measure\) "),),
        },
    },
    "Eblaite": {
        "next": "semitic-group",
    },
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {
        "next": "romance-group",
    },
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ("verb", r"^\(to\) ", "", ""),
            ("verb", "^to ", "", ""),
            ("verb", r"^I ", "", "first-person singular"),
            ("verb", r"^you ", "", "second-person"),
            ("verb", r"^he ", "", "third-person singular"),
            ("verb", r"^we ", "", "first-person plural"),
            ("verb", r"^they ", "", "third-person"),
            ("verb", r"^it ", "", "third-person singular"),
            ("verb", r"^thou ", "", "second-person singular"),
            ("verb", r"^ye ", "", "second-person plural"),
            ("verb", r" \(thou\)$", "", "second-person singular"),
            ("verb", r" \(ye\)$", "", "second-person plural"),
            ("verb", r"^he/she/it ", "", "third-person singular"),
            ("verb", r"^he/she/it/they ", "", "third-person singular"),
            ("verb", r"\bhim/her/it/them ", "", "third-person singular"),
            ("verb", r"\bthem ", "", "third-person"),
            ("verb", r"\bus ", "", "first-person plural"),
            ("verb", r"\bme ", "", "first-person singular"),
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": {"non-finite"},
        "hdr_expand_cont": {"voice"},
    },
    "Faroese": {
        "ignore_top_left_text_cell": True,
    },
    "Fijian": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Finnish": {
        "hdr_expand_first": set(),
    },
    "French": {
        "next": "romance-group",
    },
    "Friulian": {
        "next": "romance-group",
    },
    "Galician": {
        "next": "romance-group",
    },
    "German": {
        "next": "german-group",
        "form_transformations": [
            ("verb", "^ich ", "", "first-person singular"),
            ("verb", "^du ", "", "second-person singular"),
            ("verb", "^er ", "", "third-person singular"),
            ("verb", "^wir ", "", "first-person plural"),
            ("verb", "^ihr ", "", "second-person plural"),
            ("verb", "^sie ", "", "third-person plural"),
            (
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ),
            (
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ),
            (
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ),
            (
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ),
            (
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ),
            (
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ),
            ("verb", r" \(du\)$", "", "second-person singular"),
            ("verb", r" \(ihr\)$", "", "second-person plural"),
            ("adj", "^er ist ", "", "masculine singular"),
            ("adj", "^sie ist ", "", "feminine singular"),
            ("adj", "^es ist ", "", "neuter singular"),
            ("adj", "^sie sind ", "", "plural"),
            ("adj", "^keine ", "keine ", "negative"),
            ("adj", "^keiner ", "keiner ", "negative"),
            ("adj", "^keinen ", "keinen ", "negative"),
        ],
        "conditionally_ignored_cells": {
            "definite": [
                "der",
                "die",
                "das",
                "des",
                "dem",
                "den",
            ],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": [
                "kein",
                "keine",
                "keiner",
                "keinen",
            ],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": {"mood", "non-finite"},
        "hdr_expand_cont": {"tense"},
    },
    "Gothic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": {"mood", "tense", "aspect", "dummy"},
        "hdr_expand_cont": {"tense", "person", "number", "aspect"},
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {
        "next": "austronesian-group",
    },
    "Hebrew": {
        "next": "semitic-group",
    },
    "Hijazi Arabic": {
        "next": "semitic-group",
    },
    "Hopi": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Hungarian": {
        "hdr_expand_first": set(),
        "hdr_expand_cont": set(),
    },
    "Hunsrik": {
        "next": "German",
    },
    "Icelandic": {
        "ignore_top_left_text_cell": True,
    },
    "Ilokano": {
        "next": "austronesian-group",
    },
    "Inari Sami": {
        "next": "samojedic-group",
    },
    "Inuktitut": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": {"mood", "tense"},
        "hdr_expand_cont": {"person", "register", "number", "misc"},
        "form_transformations": [
            ("verb", "^non ", "", "negative"),
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {
        "next": "bantu-group",
    },
    "Kapampangan": {
        "next": "austronesian-group",
    },
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {
        "next": "bantu-group",
    },
    "Ladin": {
        "next": "romance-group",
    },
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {
        "empty_row_resets": True,
    },
    "Ligurian": {
        "next": "romance-group",
    },
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {
        "next": "bantu-group",
    },
    "Lombard": {
        "next": "romance-group",
    },
    "Lower Sorbian": {
        "next": "slavic-group",
    },
    "Luganda": {
        "next": "bantu-group",
    },
    "Lule Sami": {
        "next": "samojedic-group",
    },
    "Luxembourgish": {
        "next": "German",
    },
    "Maltese": {
        "next": "semitic-group",
    },
    "Maore Comorian": {
        "next": "bantu-group",
    },
    "Masaba": {
        "next": "bantu-group",
    },
    "Mirandese": {
        "next": "romance-group",
    },
    "Moroccan Arabic": {
        "next": "semitic-group",
    },
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {
        "next": "bantu-group",
    },
    "Mwani": {
        "next": "bantu-group",
    },
    "Navajo": {
        "numbers": [
            "singular",
            "plural",
            "dual",
            "duoplural",
        ],
    },
    "Neapolitan": {
        "next": "romance-group",
    },
    "Nenets": {
        "next": "uralic-group",
    },
    "Ngazidja Comorian": {
        "next": "bantu-group",
    },
    "Niuean": {
        "next": "austronesian-group",
    },
    "Northern Kurdish": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Northern Ndebele": {
        "next": "bantu-group",
    },
    "Northern Sami": {
        "next": "samojedic-group",
    },
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {
        "next": "bantu-group",
    },
    "Occitan": {
        "next": "romance-group",
    },
    "Old Church Slavonic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Old English": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Norse": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Irish": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Pennsylvania German": {
        "next": "German",
    },
    "Phoenician": {
        "next": "semitic-group",
    },
    "Phuthi": {
        "next": "bantu-group",
    },
    "Pite Sami": {
        "next": "samojedic-group",
    },
    "Polish": {
        "next": "slavic-group",
    },
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Proto-Indo-European": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Proto-Samic": {
        "next": "samojedic-group",
    },
    "Proto-Uralic": {
        "next": "uralic-group",
    },
    "Raga": {
        "numbers": ["singular", "dual", "trial", "plural"],
    },
    "Romagnol": {
        "next": "romance-group",
    },
    "Romanian": {
        "next": "romance-group",
    },
    "Romansch": {
        "next": "romance-group",
    },
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": {"non-finite", "mood", "tense"},
        "hdr_expand_cont": {"tense", "number"},
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {
        "next": "bantu-group",
    },
    "Sanskrit": {
        "next": "Proto-Indo-European",
    },
    "Sardinian": {
        "next": "romance-group",
    },
    "Sassarese": {
        "next": "romance-group",
    },
    "Scottish Gaelic": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {
        "next": "romance-group",
    },
    "Skolt Sami": {
        "next": "samojedic-group",
    },
    "Slovene": {
        "next": "slavic-group",
    },
    "Shona": {
        "next": "bantu-group",
    },
    "Sotho": {
        "next": "bantu-group",
    },
    "South Levantine Arabic": {
        "next": "semitic-group",
    },
    "Southern Ndebele": {
        "next": "bantu-group",
    },
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ("verb", "^no ", "", "negative"),
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {
        "next": "bantu-group",
    },
    "Swedish": {
        "hdr_expand_first": {"referent"},
        "hdr_expand_cont": {"degree", "polarity"},
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {
        "next": "bantu-group",
    },
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {
        "next": "austronesian-group",
    },
    "Tausug": {
        "next": "austronesian-group",
    },
    "Tigre": {
        "next": "semitic-group",
    },
    "Tigrinya": {
        "next": "semitic-group",
    },
    "Tongan": {
        "next": "austronesian-group",
    },
    "Tsonga": {
        "next": "bantu-group",
    },
    "Tswana": {
        "next": "bantu-group",
    },
    "Tumbuka": {
        "next": "bantu-group",
    },
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {
        "next": "semitic-group",
    },
    "Ukrainian": {
        "next": "slavic-group",
    },
    "Upper Sorbian": {
        "next": "slavic-group",
    },
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {
        "next": "romance-group",
    },
    "Warlpiri": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Xhosa": {
        "next": "bantu-group",
    },
    "Zulu": {
        "next": "bantu-group",
    },
    "ǃXóõ": {
        "next": "bantu-group",
    },
}
# Sanity check lang_specific
# def_ls_keys = lang_specific["default"].keys()
# for k, v in lang_specific.items():
#     if k[0].isupper() and k not in languages_by_name:
#         raise AssertionError(
#             "key {!r} in lang_specific is not a valid language"
#             .format(k))
#     assert isinstance(v, dict)
#     for kk, vv in v.items():
#         if kk not in def_ls_keys and kk != "next":
#             raise AssertionError("{} key {!r} not in default entry"
#                                  .format(k, kk))
#         if kk in ("hdr_expand_first", "hdr_expand_cont"):
#             if not isinstance(vv, set):
#                 raise AssertionError("{} key {!r} must be set"
#                                      .format(k, kk))
#             for t in vv:
#                 if t not in tag_categories:
#                     raise AssertionError("{} key {!r} invalid tag category {}"
#                                          .format(k, kk, t))
#         elif kk in ("genders", "numbers", "persons", "strengths", "voices"):
#             if not vv:
#                 continue
#             if not isinstance(vv, (list, tuple, set)):
#                 raise AssertionError("{} key {!r} must be list/tuple/set"
#                                      .format(k, kk))
#             for t in vv:
#                 if t not in valid_tags:
#                     raise AssertionError("{} key {!r} invalid tag {!r}"
#                                          .format(k, kk, t))
#         elif kk == "lang_tag_mappings" and vv is not None:
#             for pos, transf in vv.items():
#                 assert pos in PARTS_OF_SPEECH
#                 assert isinstance(transf, dict)
#                 for pre, post in transf.items():
#                     assert isinstance(pre, tuple)
#                     assert all(t in valid_tags for t in pre)
#                     assert isinstance(post, list)
#                     assert all(t in valid_tags for t in post)
#         elif kk == "next":
#             if vv not in lang_specific:
#                 raise AssertionError("{} key {!r} value {!r} is not defined"
#                                      .format(k, kk, vv))
def get_lang_conf(lang: str, field: str):
    """Return the value of ``field`` for ``lang`` from ``lang_specific``.

    The lookup starts at the entry for ``lang`` and follows each entry's
    "next" key until an entry defines ``field``.  Languages that are not
    listed, and entries without a "next" key, fall back on "default".

    Raises RuntimeError if ``field`` is not defined even in "default", if
    ``lang_specific`` has no "default" entry, or if the "next" chain loops
    back on itself (the last two would otherwise never terminate).
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    visited = set()  # entries already consulted; used to detect cycles
    while True:
        lconfigs = lang_specific.get(lang)
        if lconfigs is None:
            if lang == "default":
                raise RuntimeError('lang_specific has no "default" entry')
            # Unknown language: use the defaults.
            lang = "default"
            continue
        if field in lconfigs:
            return lconfigs[field]
        if lang == "default":
            # End of every fallback chain; the field name itself is wrong.
            raise RuntimeError(
                "Invalid lang_specific field {!r}".format(field)
            )
        visited.add(lang)
        lang = lconfigs.get("next", "default")
        if lang in visited:
            raise RuntimeError(
                "Cyclic 'next' chain in lang_specific at {!r}".format(lang)
            )
def lang_specific_tags(
    lang: str, pos: str, form: str
) -> tuple[str, list[str]]:
    """Extract tags from the word form itself in a language-specific way.

    This may also adjust the word form.  For example, German inflected verb
    forms don't have person and number specified in the table, but include
    a pronoun.

    The "form_transformations" rules of ``lang`` are tried in order; the
    first rule whose part of speech matches ``pos`` and whose regex matches
    ``form`` is applied and no later rule is considered.

    Returns ``(form, tags)``: the form with the matched text replaced by the
    rule's replacement string, and the rule's tags as a list.  When no rule
    applies, the form is returned unchanged with an empty list.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    rules = get_lang_conf(lang, "form_transformations")
    for patpos, pattern, dst, tags in rules:
        # PoS, regex, replacement, tags; pattern -> dst :: "^ich " > ""
        # A rule may name one POS or a tuple of them; treat both uniformly.
        rule_poses = patpos if isinstance(patpos, tuple) else (patpos,)
        for p in rule_poses:
            assert p in PARTS_OF_SPEECH
        if pos not in rule_poses:
            continue
        m = re.search(pattern, form)
        if m is None:
            continue
        # dst is inserted literally (it is not a regex replacement template).
        new_form = form[: m.start()] + dst + form[m.end() :]
        tag_list = tags.split()
        for t in tag_list:
            assert t in valid_tags
        return new_form, tag_list
    return form, []