Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 96%
36 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1# Language-specific configuration for various aspects of inflection table
2# parsing.
4import re
5from typing import Optional, TypedDict, Union
7from ...tags import valid_tags
8from .parts_of_speech import PARTS_OF_SPEECH
class LangConfDict(TypedDict, total=False):
    """Schema of a single ``lang_specific`` entry.

    The type is declared with ``total=False``, so every key is optional
    and an entry may list only the settings it overrides.
    """

    # Name of another lang_specific entry to consult when a field is
    # not present in this one.
    next: str
    hdr_expand_first: set[str]
    hdr_expand_cont: set[str]
    animate_inanimate_remove: bool
    both_active_passive_remove: bool
    both_strong_weak_remove: bool
    definitenesses: list[str]
    empty_row_resets: bool
    # tag extraction, lang_specific_tags()
    form_transformations: list[list[str]]
    genders: Optional[list[str]]
    imperative_no_tense: bool
    masc_only_animate: bool  # Slavic special
    numbers: list[str]
    persons: list[str]
    pl_virile_nonvirile: bool
    reuse_cellspan: str  # stop/skip/reuse
    skip_mood_mood: bool
    skip_tense_tense: bool
    stop_non_finite_non_finite: bool
    stop_non_finite_voice: bool
    stop_non_finite_tense: bool
    strengths: list[str]
    virile_nonvirile_remove: bool
    voices: list[str]
    # value: (split phrase, tags)
    special_phrase_splits: dict[str, list[Union[list[str], str]]]
    # value: [replacement, tags]
    form_replacements: dict[str, Union[str, list[str]]]
    # Greek-style bracket semantics
    parentheses_for_informal: bool
    square_brackets_for_rare: bool
    curly_brackets_for_archaic: bool
    # Armenian; migrated old data here
    lang_tag_mappings: Optional[dict[str, dict[tuple[str, ...], list[str]]]]
    # Spanish has a lot of "vos" and "tú" in its tables that look like
    # references, and they give their form certain tags.
    # Dict of references ("vos") that point to tag strings "first-person
    # singular" that *extend* tags.
    special_references: Optional[dict[str, str]]
    # Some languages like Icelandic and Faroese have text cells in the
    # upper left that we'd like to ignore.
    ignore_top_left_text_cell: bool
    # Minor regex replacements for cleanup in parse_simple_table();
    # dict of {regex: substitution}
    minor_text_cleanups: Optional[dict[str, str]]
    articles_in_separate_columns: bool
    # Cells to ignore in this language, unless the cell has the key
    # as a tag.
    conditionally_ignored_cells: dict[str, list[str]]
# Per-language configuration table.  Keys are language names, "*-group"
# names shared by several languages, or "default".  An entry lists only
# the fields it overrides; get_lang_conf() follows the entry's "next" key
# (or falls back to "default") for every field that is not present.
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": {
            "number",
            "mood",
            "referent",
            "aspect",
            "tense",
            "voice",
            "non-finite",
            "case",
            "possession",
        },
        "hdr_expand_cont": {
            "person",
            "gender",
            "number",
            "degree",
            "polarity",
            "voice",
            "misc",
        },
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
    },
    "austronesian-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "bantu-group": {
        "genders": None,
    },
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {
        "next": "uralic-group",
    },
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "german-group": {  # languages closely related to or offshoots of German
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    "germanic-group": {  # Germanic languages as a whole
        "next": "indo-european-group",
    },
    "Akkadian": {
        "next": "semitic-group",
    },
    "Alemannic German": {
        "next": "German",
    },
    "Amharic": {
        "next": "semitic-group",
    },
    "Ancient Greek": {
        "next": "Proto-Indo-European",  # Has dual
        "form_transformations": [
            # Used to remove the gendered article alternatives at the start
            # of table entries like ἰχθυοκένταυρος / Ancient Greek
            ["noun", "^ὁ, ἡ ", "", ""],
            ["noun", "^τὼ ", "", ""],
            ["noun", "^οἱ, αἱ ", "", ""],
            ["noun", "^τοῦ, τῆς ", "", ""],
            ["noun", "^τοῖν ", "", ""],
            ["noun", "^τῶν ", "", ""],
            ["noun", "^τῷ, τῇ ", "", ""],
            ["noun", "^τοῖς, ταῖς ", "", ""],
            ["noun", "^τὸν, τὴν ", "", ""],
            ["noun", "^τὼ ", "", ""],
            ["noun", "^τοὺς, τᾱ̀ς ", "", ""],
            ["noun", "(?m)^ho, hē ", "", ""],
            ["noun", "(?m)^tṑ ", "", ""],
            ["noun", "(?m)^hoi, hai ", "", ""],
            ["noun", "(?m)^toû, tês ", "", ""],
            ["noun", "(?m)^toîn ", "", ""],
            ["noun", "(?m)^tôn ", "", ""],
            ["noun", "(?m)^tôi, têi ", "", ""],
            ["noun", "(?m)^toîs, taîs ", "", ""],
            ["noun", "(?m)^tòn, tḕn ", "", ""],
            ["noun", "(?m)^tṑ ", "", ""],
            ["noun", "(?m)^toùs, tā̀s ", "", ""],
            # New added ones, leaving the old ones just in case
            ["noun", r"^ὁ ", "", ""],
            ["noun", r"^ἡ ", "", ""],
            ["noun", r"(?m)^hē ", "", ""],
            ["noun", r"^αἱ ", "", ""],
            ["noun", r"(?m)^hai ", "", ""],
            ["noun", r"^τῆς ", "", ""],
            ["noun", r"(?m)^tês ", "", ""],
            ["noun", r"^τῇ ", "", ""],
            ["noun", r"(?m)^tēî ", "", ""],
            ["noun", r"^ταῖς ", "", ""],
            ["noun", r"(?m)^taîs ", "", ""],
            ["noun", r"^τὴν ", "", ""],
            ["noun", r"(?m)^tḕn ", "", ""],
            ["noun", r"^τᾱ̀ς ", "", ""],
            ["noun", r"(?m)^tā̀s ", "", ""],
            ["noun", r"^ὁ / ἡ ", "", ""],
            ["noun", r"(?m)^ho / hē ", "", ""],
            ["noun", r"^οἱ / αἱ ", "", ""],
            ["noun", r"(?m)^hoi / hai ", "", ""],
            ["noun", r"^τοῦ / τῆς ", "", ""],
            ["noun", r"(?m)^toû / tês ", "", ""],
            ["noun", r"^τῷ / τῇ ", "", ""],
            ["noun", r"(?m)^tōî / tēî ", "", ""],
            ["noun", r"^τοῖς / ταῖς ", "", ""],
            ["noun", r"(?m)^toîs / taîs ", "", ""],
            ["noun", r"^τὸν / τὴν ", "", ""],
            ["noun", r"(?m)^tòn / tḕn ", "", ""],
            ["noun", r"^τοὺς / τᾱ̀ς ", "", ""],
            ["noun", r"(?m)^toùs / tā̀s ", "", ""],
            ["noun", r"^οἱ ", "", ""],
            ["noun", r"(?m)^hoi ", "", ""],
            ["noun", r"^τοῦ ", "", ""],
            ["noun", r"(?m)^toû ", "", ""],
            ["noun", r"^τῷ ", "", ""],
            ["noun", r"(?m)^tôi ", "", ""],
            ["noun", r"^τοῖς ", "", ""],
            ["noun", r"(?m)^toîs ", "", ""],
            ["noun", r"^τὸν ", "", ""],
            # The "(?m)" entry of this pair repeated the Greek article
            # instead of giving its romanization like the neighbouring
            # pairs do.  It is kept so existing matches do not change, and
            # the romanized article is added right after it.
            ["noun", r"(?m)^τὸν ", "", ""],
            ["noun", r"(?m)^tòn ", "", ""],
            ["noun", r"^τοὺς ", "", ""],
            ["noun", r"(?m)^toùs ", "", ""],
            ["noun", r"^τὸ ", "", ""],
            ["noun", r"(?m)^tò ", "", ""],
            ["noun", r"^τᾰ̀ ", "", ""],
            ["noun", r"(?m)^tằ ", "", ""],
            ["noun", r"^τοῖσῐ / τοῖσῐν ", "", ""],
            # XXX THIS IS BAD, IF POSSIBLE FIX, ISSUE #1313
            ["noun", r"(?m)^toîsĭ\(n\) ", "", ""],
            ["noun", r"(?m)^toîsĭ ", "", ""],
            ["noun", r"(?m)^toîsĭn ", "", ""],
            # END BAD
            # ["noun", r"^", "", ""],
            # ["noun", r"(?m)^", "", ""],
        ],
    },
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": {"number"},
        "hdr_expand_cont": {"gender", "referent", "misc", "number", "class"},
    },
    "Aragonese": {
        "next": "romance-group",
    },
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                # NOTE(review): the plural key maps to the same tags as the
                # singular one; confirm "possessed-single" is intended here.
                ("possessive", "plural"): ["possessive", "possessed-single"],
            },
        },
    },
    "Aromanian": {
        "next": "romance-group",
    },
    "Aramaic": {
        "next": "semitic-group",
    },
    "Avestan": {
        "next": "Proto-Indo-European",
    },
    "Bavarian": {
        "next": "German",
    },
    "Baiso": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Belarusian": {
        "next": "slavic-group",
    },
    "Bende": {
        "next": "bantu-group",
    },
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {
        "next": "romance-group",
    },
    "Chichewa": {
        "next": "bantu-group",
    },
    "Chimwiini": {
        "next": "bantu-group",
    },
    "Cimbrian": {
        "next": "German",
    },
    "Corsican": {
        "next": "romance-group",
    },
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": {"tense", "mood", "non-finite"},
        "hdr_expand_cont": {"tense", "mood", "voice"},
    },
    "Dalmatian": {
        "next": "romance-group",
    },
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "form_transformations": [
            ["noun", r"^\(as a measure\) ", "", ""],
        ],
    },
    "Eblaite": {
        "next": "semitic-group",
    },
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {
        "next": "romance-group",
    },
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ["verb", r"^\(to\) ", "", ""],
            ["verb", "^to ", "", ""],
            ["verb", r"^I ", "", "first-person singular"],
            ["verb", r"^you ", "", "second-person"],
            ["verb", r"^he ", "", "third-person singular"],
            ["verb", r"^we ", "", "first-person plural"],
            ["verb", r"^they ", "", "third-person"],
            ["verb", r"^it ", "", "third-person singular"],
            ["verb", r"^thou ", "", "second-person singular"],
            ["verb", r"^ye ", "", "second-person plural"],
            ["verb", r" \(thou\)$", "", "second-person singular"],
            ["verb", r" \(ye\)$", "", "second-person plural"],
            ["verb", r"^he/she/it ", "", "third-person singular"],
            ["verb", r"^he/she/it/they ", "", "third-person singular"],
            ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
            ["verb", r"\bthem ", "", "third-person"],
            ["verb", r"\bus ", "", "first-person plural"],
            ["verb", r"\bme ", "", "first-person singular"],
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": {"non-finite"},
        "hdr_expand_cont": {"voice"},
    },
    "Faroese": {
        "ignore_top_left_text_cell": True,
    },
    "Fijian": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Finnish": {
        "hdr_expand_first": set(),
    },
    "French": {
        "next": "romance-group",
    },
    "Friulian": {
        "next": "romance-group",
    },
    "Galician": {
        "next": "romance-group",
    },
    "German": {
        "next": "german-group",
        "form_transformations": [
            ["verb", "^ich ", "", "first-person singular"],
            ["verb", "^du ", "", "second-person singular"],
            ["verb", "^er ", "", "third-person singular"],
            ["verb", "^wir ", "", "first-person plural"],
            ["verb", "^ihr ", "", "second-person plural"],
            ["verb", "^sie ", "", "third-person plural"],
            [
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ],
            ["verb", r" \(du\)$", "", "second-person singular"],
            ["verb", r" \(ihr\)$", "", "second-person plural"],
            ["adj", "^er ist ", "", "masculine singular"],
            ["adj", "^sie ist ", "", "feminine singular"],
            ["adj", "^es ist ", "", "neuter singular"],
            ["adj", "^sie sind ", "", "plural"],
            ["adj", "^keine ", "keine ", "negative"],
            ["adj", "^keiner ", "keiner ", "negative"],
            ["adj", "^keinen ", "keinen ", "negative"],
        ],
        "conditionally_ignored_cells": {
            "definite": [
                "der",
                "die",
                "das",
                "des",
                "dem",
                "den",
            ],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": [
                "kein",
                "keine",
                "keiner",
                "keinen",
            ],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": {"mood", "non-finite"},
        "hdr_expand_cont": {"tense"},
    },
    "Gothic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": {"mood", "tense", "aspect", "dummy"},
        "hdr_expand_cont": {"tense", "person", "number", "aspect"},
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {
        "next": "austronesian-group",
    },
    "Hebrew": {
        "next": "semitic-group",
    },
    "Hijazi Arabic": {
        "next": "semitic-group",
    },
    "Hopi": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Hungarian": {
        "hdr_expand_first": set(),
        "hdr_expand_cont": set(),
    },
    "Hunsrik": {
        "next": "German",
    },
    "Icelandic": {
        "ignore_top_left_text_cell": True,
    },
    "Ilokano": {
        "next": "austronesian-group",
    },
    "Inari Sami": {
        "next": "samojedic-group",
    },
    "Inuktitut": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": {"mood", "tense"},
        "hdr_expand_cont": {"person", "register", "number", "misc"},
        "form_transformations": [
            ["verb", "^non ", "", "negative"],
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {
        "next": "bantu-group",
    },
    "Kapampangan": {
        "next": "austronesian-group",
    },
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {
        "next": "bantu-group",
    },
    "Ladin": {
        "next": "romance-group",
    },
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {
        "empty_row_resets": True,
    },
    "Ligurian": {
        "next": "romance-group",
    },
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {
        "next": "bantu-group",
    },
    "Lombard": {
        "next": "romance-group",
    },
    "Lower Sorbian": {
        "next": "slavic-group",
    },
    "Luganda": {
        "next": "bantu-group",
    },
    "Lule Sami": {
        "next": "samojedic-group",
    },
    "Luxembourgish": {
        "next": "German",
    },
    "Maltese": {
        "next": "semitic-group",
    },
    "Maore Comorian": {
        "next": "bantu-group",
    },
    "Masaba": {
        "next": "bantu-group",
    },
    "Mirandese": {
        "next": "romance-group",
    },
    "Moroccan Arabic": {
        "next": "semitic-group",
    },
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {
        "next": "bantu-group",
    },
    "Mwani": {
        "next": "bantu-group",
    },
    "Navajo": {
        "numbers": [
            "singular",
            "plural",
            "dual",
            "duoplural",
        ],
    },
    "Neapolitan": {
        "next": "romance-group",
    },
    "Nenets": {
        "next": "uralic-group",
    },
    "Ngazidja Comorian": {
        "next": "bantu-group",
    },
    "Niuean": {
        "next": "austronesian-group",
    },
    "Northern Kurdish": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Northern Ndebele": {
        "next": "bantu-group",
    },
    "Northern Sami": {
        "next": "samojedic-group",
    },
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {
        "next": "bantu-group",
    },
    "Occitan": {
        "next": "romance-group",
    },
    "Old Church Slavonic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Old English": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Norse": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Irish": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Pennsylvania German": {
        "next": "German",
    },
    "Phoenician": {
        "next": "semitic-group",
    },
    "Phuthi": {
        "next": "bantu-group",
    },
    "Pite Sami": {
        "next": "samojedic-group",
    },
    "Polish": {
        "next": "slavic-group",
    },
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Proto-Indo-European": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Proto-Samic": {
        "next": "samojedic-group",
    },
    "Proto-Uralic": {
        "next": "uralic-group",
    },
    "Raga": {
        "numbers": ["singular", "dual", "trial", "plural"],
    },
    "Romagnol": {
        "next": "romance-group",
    },
    "Romanian": {
        "next": "romance-group",
    },
    "Romansch": {
        "next": "romance-group",
    },
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": {"non-finite", "mood", "tense"},
        "hdr_expand_cont": {"tense", "number"},
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {
        "next": "bantu-group",
    },
    "Sanskrit": {
        "next": "Proto-Indo-European",
    },
    "Sardinian": {
        "next": "romance-group",
    },
    "Sassarese": {
        "next": "romance-group",
    },
    "Scottish Gaelic": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {
        "next": "romance-group",
    },
    "Skolt Sami": {
        "next": "samojedic-group",
    },
    "Slovene": {
        "next": "slavic-group",
    },
    "Shona": {
        "next": "bantu-group",
    },
    "Sotho": {
        "next": "bantu-group",
    },
    "South Levantine Arabic": {
        "next": "semitic-group",
    },
    "Southern Ndebele": {
        "next": "bantu-group",
    },
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ["verb", "^no ", "", "negative"],
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {
        "next": "bantu-group",
    },
    "Swedish": {
        "hdr_expand_first": {"referent"},
        "hdr_expand_cont": {"degree", "polarity"},
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {
        "next": "bantu-group",
    },
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {
        "next": "austronesian-group",
    },
    "Tausug": {
        "next": "austronesian-group",
    },
    "Tigre": {
        "next": "semitic-group",
    },
    "Tigrinya": {
        "next": "semitic-group",
    },
    "Tongan": {
        "next": "austronesian-group",
    },
    "Tsonga": {
        "next": "bantu-group",
    },
    "Tswana": {
        "next": "bantu-group",
    },
    "Tumbuka": {
        "next": "bantu-group",
    },
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {
        "next": "semitic-group",
    },
    "Ukrainian": {
        "next": "slavic-group",
    },
    "Upper Sorbian": {
        "next": "slavic-group",
    },
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {
        "next": "romance-group",
    },
    "Warlpiri": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Xhosa": {
        "next": "bantu-group",
    },
    "Zulu": {
        "next": "bantu-group",
    },
    # NOTE(review): confirm that "bantu-group" is the intended parent for
    # this language.
    "ǃXóõ": {
        "next": "bantu-group",
    },
}
# Sanity check lang_specific (disabled).  As written it also needs
# languages_by_name and tag_categories to be in scope.
# def_ls_keys = lang_specific["default"].keys()
# for k, v in lang_specific.items():
#     if k[0].isupper() and k not in languages_by_name:
#         raise AssertionError(
#             "key {!r} in lang_specific is not a valid language"
#             .format(k))
#     assert isinstance(v, dict)
#     for kk, vv in v.items():
#         if kk not in def_ls_keys and kk != "next":
#             raise AssertionError("{} key {!r} not in default entry"
#                                  .format(k, kk))
#         if kk in ("hdr_expand_first", "hdr_expand_cont"):
#             if not isinstance(vv, set):
#                 raise AssertionError("{} key {!r} must be set"
#                                      .format(k, kk))
#             for t in vv:
#                 if t not in tag_categories:
#                     raise AssertionError("{} key {!r} invalid tag category {}"
#                                          .format(k, kk, t))
#         elif kk in ("genders", "numbers", "persons", "strengths", "voices"):
#             if not vv:
#                 continue
#             if not isinstance(vv, (list, tuple, set)):
#                 raise AssertionError("{} key {!r} must be list/tuple/set"
#                                      .format(k, kk))
#             for t in vv:
#                 if t not in valid_tags:
#                     raise AssertionError("{} key {!r} invalid tag {!r}"
#                                          .format(k, kk, t))
#         elif kk == "lang_tag_mappings" and vv is not None:
#             for pos, transf in vv.items():
#                 assert pos in PARTS_OF_SPEECH
#                 assert isinstance(transf, dict)
#                 for pre, post in transf.items():
#                     assert isinstance(pre, tuple)
#                     assert all(t in valid_tags for t in pre)
#                     assert isinstance(post, list)
#                     assert all(t in valid_tags for t in post)
#         elif kk == "next":
#             if vv not in lang_specific:
#                 raise AssertionError("{} key {!r} value {!r} is not defined"
#                                      .format(k, kk, vv))
def get_lang_conf(lang, field):
    """Return ``field`` from the language-specific configuration of ``lang``.

    The lookup starts at the ``lang_specific`` entry named ``lang`` and, as
    long as the current entry does not define ``field``, continues with the
    entry named by its ``"next"`` key, or with ``"default"`` when it has no
    ``"next"``.  A name that has no entry at all is treated as ``"default"``.

    Args:
        lang: Language name or group name used as a key in ``lang_specific``.
        field: Name of the configuration field to fetch.

    Returns:
        The value stored for ``field`` in the first entry of the chain that
        defines it.

    Raises:
        RuntimeError: If the ``"default"`` entry does not define ``field``,
            or if the chain never terminates (a ``"next"`` cycle, or a
            missing ``"default"`` entry).
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    # Names already consulted; seeing one twice means the chain would
    # otherwise be followed forever.
    visited = set()
    while True:
        if lang in visited:
            raise RuntimeError(
                "lang_specific lookup for field {!r} does not terminate "
                "(reached {!r} twice)".format(field, lang)
            )
        visited.add(lang)
        lconfigs = lang_specific.get(lang)
        if lconfigs is None:
            # Unknown name: fall back to the shared defaults.
            lang = "default"
            continue
        if field in lconfigs:
            return lconfigs[field]
        if lang == "default":
            # End of the chain reached and nobody defines the field.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        lang = lconfigs.get("next", "default")
def lang_specific_tags(lang, pos, form):
    """Derive tags from a word form using the language's rewrite rules.

    The "form_transformations" rules configured for ``lang`` are tried in
    order.  Each rule is ``[part of speech, regex, replacement, tags]``,
    e.g. the German verb rule that turns a leading "ich " into "" and
    yields "first-person singular".  Only the first rule whose part of
    speech equals ``pos`` and whose regex is found in ``form`` is applied.

    Returns a ``(form, tags)`` pair: the form with the matched span replaced
    and the rule's tags as a list, or the unchanged form and an empty list
    when no rule applies.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    for rule_pos, regex, replacement, tag_str in get_lang_conf(
        lang, "form_transformations"
    ):
        # Every rule is validated, including those for other parts of speech.
        assert rule_pos in PARTS_OF_SPEECH
        if rule_pos != pos:
            continue
        match = re.search(regex, form)
        if match is None:
            continue
        # Splice the replacement over the matched span only.
        rewritten = form[: match.start()] + replacement + form[match.end() :]
        tag_list = tag_str.split()
        assert all(tag in valid_tags for tag in tag_list)
        return rewritten, tag_list
    return form, []