Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 96%
36 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
# Language-specific configuration for various aspects of inflection table
# parsing.
4import re
5from typing import Optional, TypedDict, Union
7from ...tags import valid_tags
8from .parts_of_speech import PARTS_OF_SPEECH
class LangConfDict(TypedDict, total=False):
    """Shape of one entry in ``lang_specific``.

    Every key is optional (``total=False``); an entry only lists the fields
    it overrides.
    """

    # Name of the lang_specific entry to consult when a field is not set here
    next: str
    hdr_expand_first: set[str]
    hdr_expand_cont: set[str]
    animate_inanimate_remove: bool
    both_active_passive_remove: bool
    both_strong_weak_remove: bool
    definitenesses: list[str]
    empty_row_resets: bool
    # tag extraction, lang_specific_tags()
    form_transformations: list[list[str]]
    genders: Optional[list[str]]
    imperative_no_tense: bool
    masc_only_animate: bool  # Slavic special
    numbers: list[str]
    persons: list[str]
    pl_virile_nonvirile: bool
    reuse_cellspan: str  # stop/skip/reuse
    skip_mood_mood: bool
    skip_tense_tense: bool
    stop_non_finite_non_finite: bool
    stop_non_finite_voice: bool
    stop_non_finite_tense: bool
    strengths: list[str]
    virile_nonvirile_remove: bool
    voices: list[str]
    # value: (split phrase, tags)
    special_phrase_splits: dict[str, list[Union[list[str], str]]]
    # value: [replacement, tags]
    form_replacements: dict[str, Union[str, list[str]]]
    # Greek-style bracket semantics
    parentheses_for_informal: bool
    square_brackets_for_rare: bool
    curly_brackets_for_archaic: bool
    # Armenian; migrated old data here
    lang_tag_mappings: Optional[dict[str, dict[tuple[str, ...], list[str]]]]
    # Spanish has a lot of "vos" and "tú" in its tables that look like
    # references, and they give their form certain tags.
    # Dict of references ("vos") that point to tag strings "first-person
    # singular" that *extend* tags.
    special_references: Optional[dict[str, str]]
    # Some languages like Icelandic and Faroese have text cells in the
    # upper left that we'd like to ignore.
    ignore_top_left_text_cell: bool
    # Minor regex replacements for cleanup in parse_simple_table();
    # dict of {regex: substitution}
    minor_text_cleanups: Optional[dict[str, str]]
    articles_in_separate_columns: bool
    # Cells to ignore in this language, unless the cell has the key
    # as a tag.
    conditionally_ignored_cells: dict[str, list[str]]
# Per-language configuration table, read through get_lang_conf().
# Keys are language names or "*-group" names shared by several languages.
# An entry lists only the fields it overrides; its optional "next" key names
# the entry to fall back to, and "default" supplies a value for every field.
lang_specific: dict[str, LangConfDict] = {
    "default": {
        "hdr_expand_first": {
            "number",
            "mood",
            "referent",
            "aspect",
            "tense",
            "voice",
            "non-finite",
            "case",
            "possession",
        },
        "hdr_expand_cont": {
            "person",
            "gender",
            "number",
            "degree",
            "polarity",
            "voice",
            "misc",
        },
        "animate_inanimate_remove": True,
        "both_active_passive_remove": True,
        "both_strong_weak_remove": True,
        "definitenesses": ["indefinite", "definite"],
        "empty_row_resets": False,
        "form_transformations": [],  # tag extraction, lang_specific_tags()
        "genders": None,
        "imperative_no_tense": False,
        "masc_only_animate": False,  # Slavic special
        "numbers": ["singular", "plural"],
        "persons": ["first-person", "second-person", "third-person"],
        "pl_virile_nonvirile": False,
        "reuse_cellspan": "skip",  # stop/skip/reuse
        "skip_mood_mood": False,
        "skip_tense_tense": False,
        "stop_non_finite_non_finite": True,
        "stop_non_finite_voice": False,
        "stop_non_finite_tense": False,
        "strengths": ["strong", "weak"],
        "virile_nonvirile_remove": True,
        "voices": ["active", "passive"],
        "special_phrase_splits": {},  # value: (split phrase, tags)
        "form_replacements": {},  # value: [replacement, tags]
        # Greek-style bracket semantics
        "parentheses_for_informal": False,
        "square_brackets_for_rare": False,
        "curly_brackets_for_archaic": False,
        # Armenian; migrated old data here
        "lang_tag_mappings": None,
        # Spanish has a lot of "vos" and "tú" in its tables that look like
        # references, and they give their form certain tags.
        # Dict of references ("vos") that point to tag strings "first-person
        # singular" that *extend* tags.
        "special_references": None,
        # Some languages like Icelandic and Faroese have text cells in the
        # upper left that we'd like to ignore.
        "ignore_top_left_text_cell": False,
        # Minor regex replacements for cleanup in parse_simple_table()
        "minor_text_cleanups": None,  # dict of {regex: substitution}
        "articles_in_separate_columns": False,
        # Cells to ignore in this language, unless the cell has the key
        # as a tag.
        "conditionally_ignored_cells": {},
    },
    "austronesian-group": {"numbers": ["singular", "dual", "plural"]},
    "bantu-group": {"genders": None},
    "indo-european-group": {
        "genders": ["masculine", "feminine", "neuter"],
        "numbers": ["singular", "plural"],
    },
    "romance-group": {},
    "slavic-group": {
        "numbers": ["singular", "plural", "dual"],
        "masc_only_animate": True,
    },
    "samojedic-group": {"next": "uralic-group"},
    "semitic-group": {
        "numbers": ["singular", "dual", "plural"],
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "uralic-group": {"numbers": ["singular", "dual", "plural"]},
    # languages closely related to or offshoot from German
    "german-group": {
        "next": "germanic-group",
        "articles_in_separate_columns": True,
    },
    # Germanic languages as a whole
    "germanic-group": {"next": "indo-european-group"},
    "Akkadian": {"next": "semitic-group"},
    "Alemannic German": {"next": "German"},
    "Amharic": {"next": "semitic-group"},
    "Ancient Greek": {"next": "Proto-Indo-European"},  # Has dual
    # "Anejom̃": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Arabic": {
        "next": "semitic-group",
        "numbers": [
            "singular",
            "dual",
            "paucal",
            "plural",
            "collective",
            "singulative",
        ],
        "reuse_cellspan": "reuse",
        "hdr_expand_first": {"number"},
        "hdr_expand_cont": {"gender", "referent", "misc", "number", "class"},
    },
    "Aragonese": {"next": "romance-group"},
    "Armenian": {
        "lang_tag_mappings": {
            "noun": {
                ("possessive", "singular"): ["possessive", "possessed-single"],
                # NOTE(review): plural maps to the same tags as singular;
                # confirm this is intended.
                ("possessive", "plural"): ["possessive", "possessed-single"],
            },
        },
    },
    "Aromanian": {"next": "romance-group"},
    "Aramaic": {"next": "semitic-group"},
    "Avestan": {"next": "Proto-Indo-European"},
    "Bavarian": {"next": "German"},
    "Baiso": {"numbers": ["singular", "paucal", "plural"]},
    "Belarusian": {"next": "slavic-group"},
    "Bende": {"next": "bantu-group"},
    # "Berber": {
    #     "definitenesses": ["indefinite", "definite", "construct"],
    # },
    "Catalan": {"next": "romance-group"},
    "Chichewa": {"next": "bantu-group"},
    "Chimwiini": {"next": "bantu-group"},
    "Cimbrian": {"next": "German"},
    "Corsican": {"next": "romance-group"},
    "Czech": {
        "next": "slavic-group",
        "hdr_expand_first": {"tense", "mood", "non-finite"},
        "hdr_expand_cont": {"tense", "mood", "voice"},
    },
    "Dalmatian": {"next": "romance-group"},
    "Danish": {
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
        "form_transformations": [
            ["noun", r"^\(as a measure\) ", "", ""],
        ],
    },
    "Eblaite": {"next": "semitic-group"},
    "Egyptian": {
        "definitenesses": ["indefinite", "definite", "construct"],
    },
    "Emilian": {"next": "romance-group"},
    "English": {
        "stop_non_finite_tense": True,  # affect/English/Verb
        "form_transformations": [
            ["verb", r"^\(to\) ", "", ""],
            ["verb", "^to ", "", ""],
            ["verb", r"^I ", "", "first-person singular"],
            ["verb", r"^you ", "", "second-person"],
            ["verb", r"^he ", "", "third-person singular"],
            ["verb", r"^we ", "", "first-person plural"],
            # NOTE: never reached; the identical "^you " rule above matches
            # first and lang_specific_tags() returns on the first match.
            ["verb", r"^you ", "", "second-person plural"],
            ["verb", r"^they ", "", "third-person plural"],
            ["verb", r"^it ", "", "third-person singular"],
            ["verb", r"^thou ", "", "second-person singular"],
            ["verb", r"^ye ", "", "second-person plural"],
            ["verb", r" \(thou\)$", "", "second-person singular"],
            ["verb", r" \(ye\)$", "", "second-person plural"],
            ["verb", r"^he/she/it ", "", "third-person singular"],
            ["verb", r"^he/she/it/they ", "", "third-person singular"],
            ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
            ["verb", r"\bthem ", "", "third-person plural"],
            ["verb", r"\bus ", "", "first-person plural"],
            ["verb", r"\bme ", "", "first-person singular"],
        ],
        "form_replacements": {
            "let’s be": ["let's be", "first-person plural pronoun-included"],
        },
        "special_phrase_splits": {
            "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
            "we are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "first-person plural",
            ],
            "thou art (’rt)/beest": [
                ["art (’rt)", "beest"],
                "second-person singular",
            ],
            "ye are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "second-person plural",
            ],
            "thou be/beest": [["be", "beest"], "second-person singular"],
            "he/she/it is (’s)/beeth/bes": [
                ["is (’s)", "beeth", "bes"],
                "third-person singular",
            ],
            "they are (’re)/be/been": [
                ["are (’re)", "be", "been"],
                "third-person plural",
            ],
            "thou wert/wast": [["wert", "wast"], "second-person singular"],
            "thou were/wert": [["were", "wert"], "second-person singular"],
            "there has been": [["there has been"], "singular"],
            "there have been": [["there have been"], "plural"],
            "there is ('s)": [["there is", "there's"], "singular"],
            "there are ('re)": [["there are", "there're"], "plural"],
            "there was": [["there was"], "singular"],
            "there were": [["there were"], "plural"],
        },
    },
    "Estonian": {
        "hdr_expand_first": {"non-finite"},
        "hdr_expand_cont": {"voice"},
    },
    "Faroese": {"ignore_top_left_text_cell": True},
    "Fijian": {"numbers": ["singular", "paucal", "plural"]},
    "Finnish": {"hdr_expand_first": set()},
    "French": {"next": "romance-group"},
    "Friulian": {"next": "romance-group"},
    "Galician": {"next": "romance-group"},
    "German": {
        "next": "german-group",
        "form_transformations": [
            ["verb", "^ich ", "", "first-person singular"],
            ["verb", "^du ", "", "second-person singular"],
            ["verb", "^er ", "", "third-person singular"],
            ["verb", "^wir ", "", "first-person plural"],
            ["verb", "^ihr ", "", "second-person plural"],
            ["verb", "^sie ", "", "third-person plural"],
            [
                "verb",
                "^dass ich ",
                "",
                "first-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass du ",
                "",
                "second-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass er ",
                "",
                "third-person singular subordinate-clause",
            ],
            [
                "verb",
                "^dass wir ",
                "",
                "first-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass ihr ",
                "",
                "second-person plural subordinate-clause",
            ],
            [
                "verb",
                "^dass sie ",
                "",
                "third-person plural subordinate-clause",
            ],
            ["verb", r" \(du\)$", "", "second-person singular"],
            ["verb", r" \(ihr\)$", "", "second-person plural"],
            ["adj", "^er ist ", "", "masculine singular"],
            ["adj", "^sie ist ", "", "feminine singular"],
            ["adj", "^es ist ", "", "neuter singular"],
            ["adj", "^sie sind ", "", "plural"],
            ["adj", "^keine ", "keine ", "negative"],
            ["adj", "^keiner ", "keiner ", "negative"],
            ["adj", "^keinen ", "keinen ", "negative"],
        ],
        "conditionally_ignored_cells": {
            "definite": ["der", "die", "das", "des", "dem", "den"],
            "indefinite": [
                "ein",
                "eine",
                "eines",
                "einer",
                "einem",
                "einen",
            ],
            "negative": ["kein", "keine", "keiner", "keinen"],
        },
    },
    "German Low German": {
        "next": "German",
        "hdr_expand_first": {"mood", "non-finite"},
        "hdr_expand_cont": {"tense"},
    },
    "Gothic": {"next": "Proto-Indo-European"},  # Has dual
    "Greek": {
        "next": "indo-european-group",
        "hdr_expand_first": {"mood", "tense", "aspect", "dummy"},
        "hdr_expand_cont": {"tense", "person", "number", "aspect"},
        "imperative_no_tense": True,
        "reuse_cellspan": "reuse",
        "skip_mood_mood": True,
        "skip_tense_tense": True,
        # είμαι/Greek
        "parentheses_for_informal": True,
        "square_brackets_for_rare": True,
        "curly_brackets_for_archaic": True,
        # For greek originally
        "minor_text_cleanups": {
            r"\s+➤\s*$": "",
        },
    },
    "Hawaiian": {"next": "austronesian-group"},
    "Hebrew": {"next": "semitic-group"},
    "Hijazi Arabic": {"next": "semitic-group"},
    "Hopi": {"numbers": ["singular", "paucal", "plural"]},
    "Hungarian": {
        "hdr_expand_first": set(),
        "hdr_expand_cont": set(),
    },
    "Hunsrik": {"next": "German"},
    "Icelandic": {"ignore_top_left_text_cell": True},
    "Ilokano": {"next": "austronesian-group"},
    "Inari Sami": {"next": "samojedic-group"},
    "Inuktitut": {"numbers": ["singular", "dual", "plural"]},
    "Italian": {
        "next": "romance-group",
        "hdr_expand_first": {"mood", "tense"},
        "hdr_expand_cont": {"person", "register", "number", "misc"},
        "form_transformations": [
            ["verb", "^non ", "", "negative"],
        ],
    },
    "Irish": {
        "next": "Old Irish",
        "genders": ["masculine", "feminine"],
    },
    "Kamba": {"next": "bantu-group"},
    "Kapampangan": {"next": "austronesian-group"},
    # "Khoe": {
    #     "numbers": ["singular", "dual", "plural"],
    # },
    "Kikuyu": {"next": "bantu-group"},
    "Ladin": {"next": "romance-group"},
    # "Larike": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Latin": {
        "next": "romance-group",
        "stop_non_finite_voice": True,
    },
    "Latvian": {"empty_row_resets": True},
    "Ligurian": {"next": "romance-group"},
    "Lihir": {
        "numbers": ["singular", "dual", "trial", "paucal", "plural"],
    },
    "Lingala": {"next": "bantu-group"},
    "Lombard": {"next": "romance-group"},
    "Lower Sorbian": {"next": "slavic-group"},
    "Luganda": {"next": "bantu-group"},
    "Lule Sami": {"next": "samojedic-group"},
    "Luxembourgish": {"next": "German"},
    "Maltese": {"next": "semitic-group"},
    "Maore Comorian": {"next": "bantu-group"},
    "Masaba": {"next": "bantu-group"},
    "Mirandese": {"next": "romance-group"},
    "Moroccan Arabic": {"next": "semitic-group"},
    # "Motuna": {
    #     "numbers": ["singular", "paucal", "plural"],
    # },
    "Mwali Comorian": {"next": "bantu-group"},
    "Mwani": {"next": "bantu-group"},
    "Navajo": {
        "numbers": ["singular", "plural", "dual", "duoplural"],
    },
    "Neapolitan": {"next": "romance-group"},
    "Nenets": {"next": "uralic-group"},
    "Ngazidja Comorian": {"next": "bantu-group"},
    "Niuean": {"next": "austronesian-group"},
    "Northern Kurdish": {"numbers": ["singular", "paucal", "plural"]},
    "Northern Ndebele": {"next": "bantu-group"},
    "Northern Sami": {"next": "samojedic-group"},
    # "Mussau": {
    #     "numbers": ["singular", "dual", "trial", "plural"],
    # },
    "Nyankole": {"next": "bantu-group"},
    "Occitan": {"next": "romance-group"},
    "Old Church Slavonic": {"next": "Proto-Indo-European"},  # Has dual
    # Had dual in pronouns
    "Old English": {"next": "Proto-Indo-European"},
    # Had dual in pronouns
    "Old Norse": {"next": "Proto-Indo-European"},
    "Old Irish": {"next": "Proto-Indo-European"},  # Has dual
    "Pennsylvania German": {"next": "German"},
    "Phoenician": {"next": "semitic-group"},
    "Phuthi": {"next": "bantu-group"},
    "Pite Sami": {"next": "samojedic-group"},
    "Polish": {"next": "slavic-group"},
    "Portuguese": {
        "next": "romance-group",
        "genders": ["masculine", "feminine"],
    },
    "Proto-Germanic": {"next": "Proto-Indo-European"},  # Has dual
    "Proto-Indo-European": {"numbers": ["singular", "dual", "plural"]},
    "Proto-Samic": {"next": "samojedic-group"},
    "Proto-Uralic": {"next": "uralic-group"},
    "Raga": {"numbers": ["singular", "dual", "trial", "plural"]},
    "Romagnol": {"next": "romance-group"},
    "Romanian": {"next": "romance-group"},
    "Romansch": {"next": "romance-group"},
    "Russian": {
        "next": "slavic-group",
        "hdr_expand_first": {"non-finite", "mood", "tense"},
        "hdr_expand_cont": {"tense", "number"},
        "reuse_cellspan": "stop",
    },
    "Rwanda-Rundi": {"next": "bantu-group"},
    "Sanskrit": {"next": "Proto-Indo-European"},
    "Sardinian": {"next": "romance-group"},
    "Sassarese": {"next": "romance-group"},
    "Scottish Gaelic": {"numbers": ["singular", "dual", "plural"]},
    "Serbo-Croatian": {
        "next": "slavic-group",
        "numbers": ["singular", "dual", "paucal", "plural"],
    },
    "Sicilian": {"next": "romance-group"},
    "Skolt Sami": {"next": "samojedic-group"},
    "Slovene": {"next": "slavic-group"},
    "Shona": {"next": "bantu-group"},
    "Sotho": {"next": "bantu-group"},
    "South Levantine Arabic": {"next": "semitic-group"},
    "Southern Ndebele": {"next": "bantu-group"},
    "Spanish": {
        "next": "romance-group",
        "form_transformations": [
            ["verb", "^no ", "", "negative"],
        ],
        "special_references": {
            "vos": "informal vos-form second-person singular",
            "ᵛᵒˢ": "informal vos-form second-person singular",
            "tú": "informal second-person singular",
        },
    },
    "Swahili": {"next": "bantu-group"},
    "Swedish": {
        "hdr_expand_first": {"referent"},
        "hdr_expand_cont": {"degree", "polarity"},
        "genders": ["common-gender", "feminine", "masculine", "neuter"],
    },
    "Swazi": {"next": "bantu-group"},
    # "Syriac": {
    #     "next": "semitic-group",
    # },
    "Tagalog": {"next": "austronesian-group"},
    "Tausug": {"next": "austronesian-group"},
    "Tigre": {"next": "semitic-group"},
    "Tigrinya": {"next": "semitic-group"},
    "Tongan": {"next": "austronesian-group"},
    "Tsonga": {"next": "bantu-group"},
    "Tswana": {"next": "bantu-group"},
    "Tumbuka": {"next": "bantu-group"},
    # "Tuscan": {
    #     "next": "romance-group",
    # },
    "Ugaritic": {"next": "semitic-group"},
    "Ukrainian": {"next": "slavic-group"},
    "Upper Sorbian": {"next": "slavic-group"},
    # "Valencian": {
    #     "next": "romance-group",
    # },
    "Venetian": {"next": "romance-group"},
    "Warlpiri": {"numbers": ["singular", "paucal", "plural"]},
    "Xhosa": {"next": "bantu-group"},
    "Zulu": {"next": "bantu-group"},
    "ǃXóõ": {"next": "bantu-group"},
}
# Sanity check lang_specific
# def_ls_keys = lang_specific["default"].keys()
# for k, v in lang_specific.items():
#     if k[0].isupper() and k not in languages_by_name:
#         raise AssertionError(
#             "key {!r} in lang_specific is not a valid language"
#             .format(k))
#     assert isinstance(v, dict)
#     for kk, vv in v.items():
#         if kk not in def_ls_keys and kk != "next":
#             raise AssertionError("{} key {!r} not in default entry"
#                                  .format(k, kk))
#         if kk in ("hdr_expand_first", "hdr_expand_cont"):
#             if not isinstance(vv, set):
#                 raise AssertionError("{} key {!r} must be set"
#                                      .format(k, kk))
#             for t in vv:
#                 if t not in tag_categories:
#                     raise AssertionError("{} key {!r} invalid tag category {}"
#                                          .format(k, kk, t))
#         elif kk in ("genders", "numbers", "persons", "strengths", "voices"):
#             if not vv:
#                 continue
#             if not isinstance(vv, (list, tuple, set)):
#                 raise AssertionError("{} key {!r} must be list/tuple/set"
#                                      .format(k, kk))
#             for t in vv:
#                 if t not in valid_tags:
#                     raise AssertionError("{} key {!r} invalid tag {!r}"
#                                          .format(k, kk, t))
#         elif kk == "lang_tag_mappings" and vv is not None:
#             for pos, transf in vv.items():
#                 assert pos in PARTS_OF_SPEECH
#                 assert isinstance(transf, dict)
#                 for pre, post in transf.items():
#                     assert isinstance(pre, tuple)
#                     assert all(t in valid_tags for t in pre)
#                     assert isinstance(post, list)
#                     assert all(t in valid_tags for t in post)
#         elif kk == "next":
#             if vv not in lang_specific:
#                 raise AssertionError("{} key {!r} value {!r} is not defined"
#                                      .format(k, kk, vv))
def get_lang_conf(lang, field):
    """Return ``field`` from the configuration of ``lang``.

    Lookup starts at ``lang_specific[lang]`` and follows each entry's
    ``"next"`` key until an entry defines ``field``. An entry without a
    ``"next"`` key, and a name that has no entry at all, both fall through
    to the ``"default"`` entry.

    Args:
        lang: Language (or group) name used as a key into ``lang_specific``.
        field: Name of the configuration field to fetch.

    Returns:
        The value of ``field`` from the first entry in the chain that has it.

    Raises:
        RuntimeError: If ``field`` is not present in the ``"default"`` entry,
            or if the fallback chain revisits an entry (cyclic ``"next"``
            links, or no ``"default"`` entry at all). The latter cases used
            to spin forever.
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    seen = set()  # entries already consulted; guards against endless walks
    while True:
        if lang in seen:
            raise RuntimeError(
                "lang_specific fallback chain loops at {!r} while looking up "
                "field {!r}".format(lang, field)
            )
        seen.add(lang)
        lconfigs = lang_specific.get(lang)
        if lconfigs is None:
            # Unlisted language, or a "next" that names no entry: use defaults.
            lang = "default"
            continue
        if field in lconfigs:
            return lconfigs[field]
        if lang == "default":
            # Nothing left to fall back to.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        lang = lconfigs.get("next", "default")
def lang_specific_tags(lang, pos, form):
    """Pull tags out of a word form using the language's rewrite rules.

    Some tables put information into the form text itself; for example,
    German inflected verb forms carry a pronoun instead of person and number
    headers. Each "form_transformations" rule of the language is a
    (part of speech, regex, replacement, tags) quadruple. The first rule
    whose part of speech equals ``pos`` and whose regex is found in ``form``
    is applied, e.g. "^ich " is replaced by "".

    Returns a pair of the (possibly rewritten) form and the list of tags of
    the applied rule; when no rule applies, the form is returned unchanged
    together with an empty list.
    """
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    for rule_pos, regex, replacement, tag_str in get_lang_conf(
        lang, "form_transformations"
    ):
        # Every rule is validated, including those for other parts of speech.
        assert rule_pos in PARTS_OF_SPEECH
        if rule_pos == pos:
            hit = re.search(regex, form)
            if hit:
                # Splice the replacement over the matched span only.
                rewritten = form[: hit.start()] + replacement + form[hit.end():]
                extracted = tag_str.split()
                assert all(t in valid_tags for t in extracted)
                return rewritten, extracted
    return form, []