Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 82%
41 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1# Language-specific configuration for various aspects of inflection table
2# parsing.
4import re
5from typing import Optional, TypedDict, Union
7from ...tags import valid_tags
8from .parts_of_speech import PARTS_OF_SPEECH
10LangConfDict = TypedDict(
11 "LangConfDict",
12 {
13 "next": str,
14 "hdr_expand_first": set[str],
15 "hdr_expand_cont": set[str],
16 "animate_inanimate_remove": bool,
17 "both_active_passive_remove": bool,
18 "both_strong_weak_remove": bool,
19 "definitenesses": list[str],
20 "empty_row_resets": bool,
21 "form_transformations": list[
22 list[tuple[str, ...] | str]
23 ], # tag extraction, lang_specific_tags()
24 "genders": Optional[list[str]],
25 "imperative_no_tense": bool,
26 "masc_only_animate": bool, # Slavic special
27 "numbers": list[str],
28 "persons": list[str],
29 "pl_virile_nonvirile": bool,
30 "reuse_cellspan": str, # stop/skip/reuse
31 "skip_mood_mood": bool,
32 "skip_tense_tense": bool,
33 "stop_non_finite_non_finite": bool,
34 "stop_non_finite_voice": bool,
35 "stop_non_finite_tense": bool,
36 "strengths": list[str],
37 "virile_nonvirile_remove": bool,
38 "voices": list[str],
39 "special_phrase_splits": dict[
40 str, list[Union[list[str], str]]
41 ], # value: (split phrase, tags)
42 "form_replacements": dict[
43 str, Union[str, list[str]]
44 ], # value: [replacement, tags]
45 # Greek-style bracket semantics
46 "parentheses_for_informal": bool,
47 "square_brackets_for_rare": bool,
48 "curly_brackets_for_archaic": bool,
49 # Armenian; migrated old data here
50 "lang_tag_mappings": Optional[
51 dict[str, dict[tuple[str, ...], list[str]]]
52 ],
53 # Spanish has a lot of "vos" and "tú" in its tables that look like
54 # references, and they give their form certain tags.
55 # Dict of references ("vos") that point to tag strings "first-person
56 # singular" that *extend* tags.
57 "special_references": Optional[dict[str, str]],
58 # Some languages like Icelandic and Faroese have text cells in the
59 # upper left that we'd like to ignore.
60 "ignore_top_left_text_cell": bool,
61 # Minor regex replacements for cleanup in parse_simple_table()
62 "minor_text_cleanups": Optional[
63 dict[str, str]
64 ], # dict of {regex: substitution}
65 "articles_in_separate_columns": bool,
66 # Cells to ignore in this language, unless the cell has the key
67 # as a tag.
68 "conditionally_ignored_cells": dict[str, list[str]],
69 "remove_text_patterns": dict[
70 tuple[str, ...], tuple[str | re.Pattern, ...]
71 ]
72 | None,
73 },
74 total=False,
75)
# Table of per-language settings for inflection table parsing.
#
# Keys are either "default" (the complete set of fallback values), a
# lower-case "*-group" name shared by several languages, or a language name.
# An entry lists only the settings it overrides; its optional "next" key
# names the entry to consult for anything it does not define.  Lookups go
# through get_lang_conf(), which follows "next" and finally falls back to
# "default".
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": {
            "number",
            "mood",
            "referent",
            "aspect",
            "tense",
            "voice",
            "non-finite",
            "case",
            "possession",
        },
        "hdr_expand_cont": {
            "person",
            "gender",
            "number",
            "degree",
            "polarity",
            "voice",
            "misc",
        },
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
        "remove_text_patterns": None,
    },
    "austronesian-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "bantu-group": {
        "genders": None,
    },
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {
        "next": "uralic-group",
    },
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {
        "numbers": ["singular", "dual", "plural"],
    },
    "german-group": {  # languages closely related to or offshot from German
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    "germanic-group": {  # Germanic languages as a whole
        "next": "indo-european-group",
    },
    "Akkadian": {
        "next": "semitic-group",
    },
    "Alemannic German": {
        "next": "German",
    },
    "Amharic": {
        "next": "semitic-group",
    },
    "Ancient Greek": {
        "next": "Proto-Indo-European",  # Has dual
        "remove_text_patterns": {
            ("noun", "name"): (
                # Used to remove the gendered article alternatives at the start
                # of table entries like ἰχθυοκένταυρος / Ancient Greek
                re.compile(
                    r"(?m)^(ā |ai |hā |hai |hē |ho |ho / hē |ho, hē |hoi |"
                    r"hoi / hai |hoi, hai |o |oi |tằ |tâ |taì |tâi |"
                    r"taîs |tân |tān |tān |tâs |tā̀s |têi |tēî |têisĭ |"
                    r"têisĭ |tḕn |tês |tò |tô |tṑ |tṑ |toi |toì |tôi |"
                    r"toîn |toîs |toîsĭ |toîsĭ\(n\) |toîsĭn |toîs / taîs |"
                    r"toîs, taîs |tôi, têi |tōî / tēî |tòn |tôn |"
                    r"tòn / tḕn |tòn, tḕn |tòs |tṑs |tṑs |toû |toùs |"
                    r"toùs / tā̀s |toùs, tā̀s |toû / tês |toû, tês )"
                ),
                # Main greek pattern
                re.compile(
                    r"^(ᾱ |ᾱ̔ |αἰ |αἱ |ἡ |ὀ |ὁ |ὁ / ἡ |ὁ, ἡ |οἰ |οἱ |οἱ / αἱ |"
                    r"οἱ, αἱ |τᾰ̀ |τᾶ |τᾷ |ταὶ |ταῖς |τᾶν |τᾱν |τᾱν |τᾶς |τᾱ̀ς |"
                    r"τῇ |τὴν |τῆς |τῇσῐ |τῇσῐν |τὸ |τοι |τοὶ |τοῖ |τοῖν |"
                    r"τοῖς |"
                    r"τοῖσῐ / τοῖσῐν |τοῖς / ταῖς |τοῖς, ταῖς |τὸν |τὸν / τὴν |"
                    r"τὸν, τὴν |τὸς |τοῦ |τοὺς |τοὺς / τᾱ̀ς |τοὺς, τᾱ̀ς |"
                    r"τοῦ / τῆς |τοῦ, τῆς |τὼ |τῶ |τῷ |τῶν |τὼς |τὼς |"
                    r"τῷ / τῇ |τῷ, τῇ |τὼ )"
                ),
            ),
        },
    },
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": {"number"},
        "hdr_expand_cont": {"gender", "referent", "misc", "number", "class"},
    },
    "Aragonese": {
        "next": "romance-group",
    },
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                # The plural row must map to the plural counterpart;
                # mapping it to "possessed-single" as well would make the
                # two rows indistinguishable.
                ("possessive", "plural"): ["possessive", "possessed-many"],
            },
        },
    },
    "Aromanian": {
        "next": "romance-group",
    },
    "Aramaic": {
        "next": "semitic-group",
    },
    "Avestan": {
        "next": "Proto-Indo-European",
    },
    "Bavarian": {
        "next": "German",
    },
    "Baiso": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Belarusian": {
        "next": "slavic-group",
    },
    "Bende": {
        "next": "bantu-group",
    },
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {
        "next": "romance-group",
    },
    "Chichewa": {
        "next": "bantu-group",
    },
    "Chimwiini": {
        "next": "bantu-group",
    },
    "Cimbrian": {
        "next": "German",
    },
    "Corsican": {
        "next": "romance-group",
    },
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": {"tense", "mood", "non-finite"},
        "hdr_expand_cont": {"tense", "mood", "voice"},
    },
    "Dalmatian": {
        "next": "romance-group",
    },
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "remove_text_patterns": {
            # tuples need the comma to be happy
            ("noun",): (re.compile(r"^\(as a measure\) "),),
        },
    },
    "Eblaite": {
        "next": "semitic-group",
    },
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {
        "next": "romance-group",
    },
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ["verb", r"^\(to\) ", "", ""],
            ["verb", "^to ", "", ""],
            ["verb", r"^I ", "", "first-person singular"],
            ["verb", r"^you ", "", "second-person"],
            ["verb", r"^he ", "", "third-person singular"],
            ["verb", r"^we ", "", "first-person plural"],
            ["verb", r"^they ", "", "third-person"],
            ["verb", r"^it ", "", "third-person singular"],
            ["verb", r"^thou ", "", "second-person singular"],
            ["verb", r"^ye ", "", "second-person plural"],
            ["verb", r" \(thou\)$", "", "second-person singular"],
            ["verb", r" \(ye\)$", "", "second-person plural"],
            ["verb", r"^he/she/it ", "", "third-person singular"],
            ["verb", r"^he/she/it/they ", "", "third-person singular"],
            ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
            ["verb", r"\bthem ", "", "third-person"],
            ["verb", r"\bus ", "", "first-person plural"],
            ["verb", r"\bme ", "", "first-person singular"],
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": {"non-finite"},
        "hdr_expand_cont": {"voice"},
    },
    "Faroese": {
        "ignore_top_left_text_cell": True,
    },
    "Fijian": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Finnish": {
        "hdr_expand_first": set(),
    },
    "French": {
        "next": "romance-group",
    },
    "Friulian": {
        "next": "romance-group",
    },
    "Galician": {
        "next": "romance-group",
    },
    "German": {
        "next": "german-group",
        "form_transformations": [
            ["verb", "^ich ", "", "first-person singular"],
            ["verb", "^du ", "", "second-person singular"],
            ["verb", "^er ", "", "third-person singular"],
            ["verb", "^wir ", "", "first-person plural"],
            ["verb", "^ihr ", "", "second-person plural"],
            ["verb", "^sie ", "", "third-person plural"],
            [
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ],
            ["verb", r" \(du\)$", "", "second-person singular"],
            ["verb", r" \(ihr\)$", "", "second-person plural"],
            ["adj", "^er ist ", "", "masculine singular"],
            ["adj", "^sie ist ", "", "feminine singular"],
            ["adj", "^es ist ", "", "neuter singular"],
            ["adj", "^sie sind ", "", "plural"],
            ["adj", "^keine ", "keine ", "negative"],
            ["adj", "^keiner ", "keiner ", "negative"],
            ["adj", "^keinen ", "keinen ", "negative"],
        ],
        "conditionally_ignored_cells": {
            "definite": [
                "der",
                "die",
                "das",
                "des",
                "dem",
                "den",
            ],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": [
                "kein",
                "keine",
                "keiner",
                "keinen",
            ],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": {"mood", "non-finite"},
        "hdr_expand_cont": {"tense"},
    },
    "Gothic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": {"mood", "tense", "aspect", "dummy"},
        "hdr_expand_cont": {"tense", "person", "number", "aspect"},
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {
        "next": "austronesian-group",
    },
    "Hebrew": {
        "next": "semitic-group",
    },
    "Hijazi Arabic": {
        "next": "semitic-group",
    },
    "Hopi": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Hungarian": {
        "hdr_expand_first": set(),
        "hdr_expand_cont": set(),
    },
    "Hunsrik": {
        "next": "German",
    },
    "Icelandic": {
        "ignore_top_left_text_cell": True,
    },
    "Ilokano": {
        "next": "austronesian-group",
    },
    "Inari Sami": {
        "next": "samojedic-group",
    },
    "Inuktitut": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": {"mood", "tense"},
        "hdr_expand_cont": {"person", "register", "number", "misc"},
        "form_transformations": [
            ["verb", "^non ", "", "negative"],
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {
        "next": "bantu-group",
    },
    "Kapampangan": {
        "next": "austronesian-group",
    },
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {
        "next": "bantu-group",
    },
    "Ladin": {
        "next": "romance-group",
    },
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {
        "empty_row_resets": True,
    },
    "Ligurian": {
        "next": "romance-group",
    },
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {
        "next": "bantu-group",
    },
    "Lombard": {
        "next": "romance-group",
    },
    "Lower Sorbian": {
        "next": "slavic-group",
    },
    "Luganda": {
        "next": "bantu-group",
    },
    "Lule Sami": {
        "next": "samojedic-group",
    },
    "Luxembourgish": {
        "next": "German",
    },
    "Maltese": {
        "next": "semitic-group",
    },
    "Maore Comorian": {
        "next": "bantu-group",
    },
    "Masaba": {
        "next": "bantu-group",
    },
    "Mirandese": {
        "next": "romance-group",
    },
    "Moroccan Arabic": {
        "next": "semitic-group",
    },
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {
        "next": "bantu-group",
    },
    "Mwani": {
        "next": "bantu-group",
    },
    "Navajo": {
        "numbers": [
            "singular",
            "plural",
            "dual",
            "duoplural",
        ],
    },
    "Neapolitan": {
        "next": "romance-group",
    },
    "Nenets": {
        "next": "uralic-group",
    },
    "Ngazidja Comorian": {
        "next": "bantu-group",
    },
    "Niuean": {
        "next": "austronesian-group",
    },
    "Northern Kurdish": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Northern Ndebele": {
        "next": "bantu-group",
    },
    "Northern Sami": {
        "next": "samojedic-group",
    },
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {
        "next": "bantu-group",
    },
    "Occitan": {
        "next": "romance-group",
    },
    "Old Church Slavonic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Old English": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Norse": {
        "next": "Proto-Indo-European",  # Had dual in pronouns
    },
    "Old Irish": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Pennsylvania German": {
        "next": "German",
    },
    "Phoenician": {
        "next": "semitic-group",
    },
    "Phuthi": {
        "next": "bantu-group",
    },
    "Pite Sami": {
        "next": "samojedic-group",
    },
    "Polish": {
        "next": "slavic-group",
    },
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {
        "next": "Proto-Indo-European",  # Has dual
    },
    "Proto-Indo-European": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Proto-Samic": {
        "next": "samojedic-group",
    },
    "Proto-Uralic": {
        "next": "uralic-group",
    },
    "Raga": {
        "numbers": ["singular", "dual", "trial", "plural"],
    },
    "Romagnol": {
        "next": "romance-group",
    },
    "Romanian": {
        "next": "romance-group",
    },
    "Romansch": {
        "next": "romance-group",
    },
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": {"non-finite", "mood", "tense"},
        "hdr_expand_cont": {"tense", "number"},
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {
        "next": "bantu-group",
    },
    "Sanskrit": {
        "next": "Proto-Indo-European",
    },
    "Sardinian": {
        "next": "romance-group",
    },
    "Sassarese": {
        "next": "romance-group",
    },
    "Scottish Gaelic": {
        "numbers": ["singular", "dual", "plural"],
    },
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {
        "next": "romance-group",
    },
    "Skolt Sami": {
        "next": "samojedic-group",
    },
    "Slovene": {
        "next": "slavic-group",
    },
    "Shona": {
        "next": "bantu-group",
    },
    "Sotho": {
        "next": "bantu-group",
    },
    "South Levantine Arabic": {
        "next": "semitic-group",
    },
    "Southern Ndebele": {
        "next": "bantu-group",
    },
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ["verb", "^no ", "", "negative"],
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {
        "next": "bantu-group",
    },
    "Swedish": {
        "hdr_expand_first": {"referent"},
        "hdr_expand_cont": {"degree", "polarity"},
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {
        "next": "bantu-group",
    },
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {
        "next": "austronesian-group",
    },
    "Tausug": {
        "next": "austronesian-group",
    },
    "Tigre": {
        "next": "semitic-group",
    },
    "Tigrinya": {
        "next": "semitic-group",
    },
    "Tongan": {
        "next": "austronesian-group",
    },
    "Tsonga": {
        "next": "bantu-group",
    },
    "Tswana": {
        "next": "bantu-group",
    },
    "Tumbuka": {
        "next": "bantu-group",
    },
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {
        "next": "semitic-group",
    },
    "Ukrainian": {
        "next": "slavic-group",
    },
    "Upper Sorbian": {
        "next": "slavic-group",
    },
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {
        "next": "romance-group",
    },
    "Warlpiri": {
        "numbers": ["singular", "paucal", "plural"],
    },
    "Xhosa": {
        "next": "bantu-group",
    },
    "Zulu": {
        "next": "bantu-group",
    },
    "ǃXóõ": {
        "next": "bantu-group",
    },
}
834# Sanity check lang_specific
835# def_ls_keys = lang_specific["default"].keys()
836# for k, v in lang_specific.items():
837# if k[0].isupper() and k not in languages_by_name:
838# raise AssertionError(
839# "key {!r} in lang_specific is not a valid language"
840# .format(k))
841# assert isinstance(v, dict)
842# for kk, vv in v.items():
843# if kk not in def_ls_keys and kk != "next":
844# raise AssertionError("{} key {!r} not in default entry"
845# .format(k, kk))
846# if kk in ("hdr_expand_first", "hdr_expand_cont"):
847# if not isinstance(vv, set):
848# raise AssertionError("{} key {!r} must be set"
#                     .format(k, kk))
850# for t in vv:
851# if t not in tag_categories:
852# raise AssertionError("{} key {!r} invalid tag category {}"
853# .format(k, kk, t))
854# elif kk in ("genders", "numbers", "persons", "strengths", "voices"):
855# if not vv:
856# continue
857# if not isinstance(vv, (list, tuple, set)):
858# raise AssertionError("{} key {!r} must be list/tuple/set"
859# .format(k, kk))
860# for t in vv:
861# if t not in valid_tags:
862# raise AssertionError("{} key {!r} invalid tag {!r}"
863# .format(k, kk, t))
864# elif kk == "lang_tag_mappings" and vv is not None:
865# for pos, transf in vv.items():
866# assert pos in PARTS_OF_SPEECH
867# assert isinstance(transf, dict)
868# for pre, post in transf.items():
869# assert isinstance(pre, tuple)
870# assert all(t in valid_tags for t in pre)
871# assert isinstance(post, list)
872# assert all(t in valid_tags for t in post)
873# elif kk == "next":
874# if vv not in lang_specific:
875# raise AssertionError("{} key {!r} value {!r} is not defined"
876# .format(k, kk, vv))
def get_lang_conf(lang, field):
    """Return the value of ``field`` for ``lang`` from ``lang_specific``.

    The lookup starts at the entry for ``lang`` and follows each entry's
    "next" key until an entry defining ``field`` is found.  A name without
    an entry, or an entry without a "next" key, falls through to the
    "default" entry.

    Raises RuntimeError if the "default" entry does not define ``field``,
    or if the chain of entries revisits a name (a cyclic "next" chain or a
    missing "default" entry), which would otherwise loop forever.
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    visited = set()
    while True:
        # Every name is legitimately consulted at most once per lookup, so
        # seeing one again means the walk can never terminate.
        if lang in visited:
            raise RuntimeError(
                "Cyclic or unresolvable lang_specific chain at {!r} "
                "while looking up field {!r}".format(lang, field)
            )
        visited.add(lang)
        lconfigs = lang_specific.get(lang)
        if lconfigs is None:
            # Unlisted language or group: use the defaults.
            lang = "default"
        elif field in lconfigs:
            return lconfigs[field]
        elif lang == "default":
            # Nothing left to fall back to.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        else:
            lang = lconfigs.get("next", "default")
def lang_specific_tags(lang, pos, form):
    """Derive tags from the word form itself using per-language rules.

    The rules come from the "form_transformations" setting of ``lang``.
    Each rule is (part(s) of speech, regex, replacement, tag string).  The
    first rule whose part of speech covers ``pos`` and whose regex is found
    in ``form`` is applied: the matched text is replaced and the rule's
    tag string is split into tags.  For example, German inflected verb
    forms carry a pronoun instead of explicit person/number; the rule
    "^ich " -> "" strips it and yields the corresponding tags.

    Returns a ``(form, tags)`` tuple; when no rule applies, the form is
    returned unchanged with an empty tag list.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    for rule_pos, regex, replacement, tag_string in get_lang_conf(
        lang, "form_transformations"
    ):
        # A rule names either one part of speech or a tuple of them.
        if isinstance(rule_pos, tuple):
            for candidate in rule_pos:
                assert candidate in PARTS_OF_SPEECH
            applies = pos in rule_pos
        else:
            assert rule_pos in PARTS_OF_SPEECH
            applies = pos == rule_pos
        if not applies:
            continue
        found = re.search(regex, form)
        if found is None:
            continue
        # Substitute only the first occurrence of the pattern.
        rewritten = form[: found.start()] + replacement + form[found.end() :]
        extracted = tag_string.split()
        for tag in extracted:
            assert tag in valid_tags
        return rewritten, extracted
    return form, []