Coverage for src/wiktextract/extractor/en/lang_specific_configs.py: 82%
41 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-18 10:14 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-18 10:14 +0000
1# Language-specific configuration for various aspects of inflection table
2# parsing.
4import re
5from typing import Optional, TypedDict, Union
7from ...tags import valid_tags
8from .parts_of_speech import PARTS_OF_SPEECH
10LangConfDict = TypedDict(
11 "LangConfDict",
12 {
13 "next": str,
14 "hdr_expand_first": set[str],
15 "hdr_expand_cont": set[str],
16 "animate_inanimate_remove": bool,
17 "both_active_passive_remove": bool,
18 "both_strong_weak_remove": bool,
19 "definitenesses": list[str],
20 "empty_row_resets": bool,
21 "form_transformations": list[
22 list[tuple[str, ...] | str]
23 ], # tag extraction, lang_specific_tags()
24 "genders": Optional[list[str]],
25 "imperative_no_tense": bool,
26 "masc_only_animate": bool, # Slavic special
27 "numbers": list[str],
28 "persons": list[str],
29 "pl_virile_nonvirile": bool,
30 "reuse_cellspan": str, # stop/skip/reuse
31 "skip_mood_mood": bool,
32 "skip_tense_tense": bool,
33 "stop_non_finite_non_finite": bool,
34 "stop_non_finite_voice": bool,
35 "stop_non_finite_tense": bool,
36 "strengths": list[str],
37 "virile_nonvirile_remove": bool,
38 "voices": list[str],
39 "special_phrase_splits": dict[
40 str, list[Union[list[str], str]]
41 ], # value: (split phrase, tags)
42 "form_replacements": dict[
43 str, Union[str, list[str]]
44 ], # value: [replacement, tags]
45 # Greek-style bracket semantics
46 "parentheses_for_informal": bool,
47 "square_brackets_for_rare": bool,
48 "curly_brackets_for_archaic": bool,
49 # Armenian; migrated old data here
50 "lang_tag_mappings": Optional[
51 dict[str, dict[tuple[str, ...], list[str]]]
52 ],
53 # Spanish has a lot of "vos" and "tú" in its tables that look like
54 # references, and they give their form certain tags.
55 # Dict of references ("vos") that point to tag strings "first-person
56 # singular" that *extend* tags.
57 "special_references": Optional[dict[str, str]],
58 # Some languages like Icelandic and Faroese have text cells in the
59 # upper left that we'd like to ignore.
60 "ignore_top_left_text_cell": bool,
61 # Minor regex replacements for cleanup in parse_simple_table()
62 "minor_text_cleanups": Optional[
63 dict[str, str]
64 ], # dict of {regex: substitution}
65 "articles_in_separate_columns": bool,
66 # Cells to ignore in this language, unless the cell has the key
67 # as a tag.
68 "conditionally_ignored_cells": dict[str, list[str]],
69 },
70 total=False,
71)
73lang_specific: dict[str, LangConfDict] = {
74 "default": {
75 "hdr_expand_first": set(
76 [
77 "number",
78 "mood",
79 "referent",
80 "aspect",
81 "tense",
82 "voice",
83 "non-finite",
84 "case",
85 "possession",
86 ]
87 ),
88 "hdr_expand_cont": set(
89 [
90 "person",
91 "gender",
92 "number",
93 "degree",
94 "polarity",
95 "voice",
96 "misc",
97 ]
98 ),
99 "animate_inanimate_remove": True,
100 "both_active_passive_remove": True,
101 "both_strong_weak_remove": True,
102 "definitenesses": ["indefinite", "definite"],
103 "empty_row_resets": False,
104 "form_transformations": [], # tag extraction, lang_specific_tags()
105 "genders": None,
106 "imperative_no_tense": False,
107 "masc_only_animate": False, # Slavic special
108 "numbers": ["singular", "plural"],
109 "persons": ["first-person", "second-person", "third-person"],
110 "pl_virile_nonvirile": False,
111 "reuse_cellspan": "skip", # stop/skip/reuse
112 "skip_mood_mood": False,
113 "skip_tense_tense": False,
114 "stop_non_finite_non_finite": True,
115 "stop_non_finite_voice": False,
116 "stop_non_finite_tense": False,
117 "strengths": ["strong", "weak"],
118 "virile_nonvirile_remove": True,
119 "voices": ["active", "passive"],
120 "special_phrase_splits": {}, # value: (split phrase, tags)
121 "form_replacements": {}, # value: [replacement, tags]
122 # Greek-style bracket semantics
123 "parentheses_for_informal": False,
124 "square_brackets_for_rare": False,
125 "curly_brackets_for_archaic": False,
126 # Armenian; migrated old data here
127 "lang_tag_mappings": None,
128 # Spanish has a lot of "vos" and "tú" in its tables that look like
129 # references, and they give their form certain tags.
130 # Dict of references ("vos") that point to tag strings "first-person
131 # singular" that *extend* tags.
132 "special_references": None,
133 # Some languages like Icelandic and Faroese have text cells in the
134 # upper left that we'd like to ignore.
135 "ignore_top_left_text_cell": False,
136 # Minor regex replacements for cleanup in parse_simple_table()
137 "minor_text_cleanups": None, # dict of {regex: substitution}
138 "articles_in_separate_columns": False,
139 # Cells to ignore in this language, unless the cell has the key
140 # as a tag.
141 "conditionally_ignored_cells": {},
142 },
143 "austronesian-group": {
144 "numbers": ["singular", "dual", "plural"],
145 },
146 "bantu-group": {
147 "genders": None,
148 },
149 "indo-european-group": {
150 "genders": ["masculine", "feminine", "neuter"],
151 "numbers": ["singular", "plural"],
152 },
153 "romance-group": {},
154 "slavic-group": {
155 "numbers": ["singular", "plural", "dual"],
156 "masc_only_animate": True,
157 },
158 "samojedic-group": {
159 "next": "uralic-group",
160 },
161 "semitic-group": {
162 "numbers": ["singular", "dual", "plural"],
163 "definitenesses": ["indefinite", "definite", "construct"],
164 },
165 "uralic-group": {
166 "numbers": ["singular", "dual", "plural"],
167 },
168 "german-group": { # languages closely related to or offshot from German
169 "next": "germanic-group",
170 "articles_in_separate_columns": True,
171 },
172 "germanic-group": { # Germanic languages as a whole
173 "next": "indo-european-group",
174 },
175 "Akkadian": {
176 "next": "semitic-group",
177 },
178 "Alemannic German": {
179 "next": "German",
180 },
181 "Amharic": {
182 "next": "semitic-group",
183 },
184 "Ancient Greek": {
185 "next": "Proto-Indo-European", # Has dual
186 "form_transformations": [
187 # Used to remove the gendered article alternatives at the start
188 # of table entries like ἰχθυοκένταυρος / Ancient Greek
189 [("noun", "name"), "^ὁ, ἡ ", "", ""],
190 [("noun", "name"), "^τὼ ", "", ""],
191 [("noun", "name"), "^οἱ, αἱ ", "", ""],
192 [("noun", "name"), "^τοῦ, τῆς ", "", ""],
193 [("noun", "name"), "^τοῖν ", "", ""],
194 [("noun", "name"), "^τῶν ", "", ""],
195 [("noun", "name"), "^τῷ, τῇ ", "", ""],
196 [("noun", "name"), "^τοῖς, ταῖς ", "", ""],
197 [("noun", "name"), "^τὸν, τὴν ", "", ""],
198 [("noun", "name"), "^τὼ ", "", ""],
199 [("noun", "name"), "^τοὺς, τᾱ̀ς ", "", ""],
200 [("noun", "name"), "(?m)^ho, hē ", "", ""],
201 [("noun", "name"), "(?m)^tṑ ", "", ""],
202 [("noun", "name"), "(?m)^hoi, hai ", "", ""],
203 [("noun", "name"), "(?m)^toû, tês ", "", ""],
204 [("noun", "name"), "(?m)^toîn ", "", ""],
205 [("noun", "name"), "(?m)^tôn ", "", ""],
206 [("noun", "name"), "(?m)^tôi, têi ", "", ""],
207 [("noun", "name"), "(?m)^toîs, taîs ", "", ""],
208 [("noun", "name"), "(?m)^tòn, tḕn ", "", ""],
209 [("noun", "name"), "(?m)^tṑ ", "", ""],
210 [("noun", "name"), "(?m)^toùs, tā̀s ", "", ""],
211 # New added ones, leaving the old ones just in case
212 [("noun", "name"), r"^ὁ ", "", ""],
213 [("noun", "name"), r"(?m)^ho ", "", ""],
214 [("noun", "name"), r"^ἡ ", "", ""],
215 [("noun", "name"), r"(?m)^hē ", "", ""],
216 [("noun", "name"), r"^αἱ ", "", ""],
217 [("noun", "name"), r"(?m)^hai ", "", ""],
218 [("noun", "name"), r"^τῆς ", "", ""],
219 [("noun", "name"), r"(?m)^tês ", "", ""],
220 [("noun", "name"), r"^τῇ ", "", ""],
221 [("noun", "name"), r"(?m)^tēî ", "", ""],
222 [("noun", "name"), r"(?m)^têi ", "", ""],
223 [("noun", "name"), r"^ταῖς ", "", ""],
224 [("noun", "name"), r"(?m)^taîs ", "", ""],
225 [("noun", "name"), r"^τὴν ", "", ""],
226 [("noun", "name"), r"(?m)^tḕn ", "", ""],
227 [("noun", "name"), r"^τᾱ̀ς ", "", ""],
228 [("noun", "name"), r"(?m)^tā̀s ", "", ""],
229 [("noun", "name"), r"^ὁ / ἡ ", "", ""],
230 [("noun", "name"), r"(?m)^ho / hē ", "", ""],
231 [("noun", "name"), r"^οἱ / αἱ ", "", ""],
232 [("noun", "name"), r"(?m)^hoi / hai ", "", ""],
233 [("noun", "name"), r"^τοῦ / τῆς ", "", ""],
234 [("noun", "name"), r"(?m)^toû / tês ", "", ""],
235 [("noun", "name"), r"^τῷ / τῇ ", "", ""],
236 [("noun", "name"), r"(?m)^tōî / tēî ", "", ""],
237 [("noun", "name"), r"^τοῖς / ταῖς ", "", ""],
238 [("noun", "name"), r"(?m)^toîs / taîs ", "", ""],
239 [("noun", "name"), r"^τὸν / τὴν ", "", ""],
240 [("noun", "name"), r"(?m)^tòn / tḕn ", "", ""],
241 [("noun", "name"), r"^τοὺς / τᾱ̀ς ", "", ""],
242 [("noun", "name"), r"(?m)^toùs / tā̀s ", "", ""],
243 [("noun", "name"), r"^οἱ ", "", ""],
244 [("noun", "name"), r"(?m)^hoi ", "", ""],
245 [("noun", "name"), r"^τοῦ ", "", ""],
246 [("noun", "name"), r"(?m)^toû ", "", ""],
247 [("noun", "name"), r"^τῷ ", "", ""],
248 [("noun", "name"), r"(?m)^tôi ", "", ""],
249 [("noun", "name"), r"^τοῖς ", "", ""],
250 [("noun", "name"), r"(?m)^toîs ", "", ""],
251 [("noun", "name"), r"^τὸν ", "", ""],
252 [("noun", "name"), r"(?m)^tòn ", "", ""],
253 [("noun", "name"), r"^τοὺς ", "", ""],
254 [("noun", "name"), r"(?m)^toùs ", "", ""],
255 [("noun", "name"), r"^τὸ ", "", ""],
256 [("noun", "name"), r"(?m)^tò ", "", ""],
257 [("noun", "name"), r"^τᾰ̀ ", "", ""],
258 [("noun", "name"), r"(?m)^tằ ", "", ""],
259 [("noun", "name"), r"^τοῖσῐ / τοῖσῐν ", "", ""],
260 # XXX THIS IS BAD, IF POSSIBLE FIX, ISSUE #1313
261 [("noun", "name"), r"(?m)^toîsĭ\(n\) ", "", ""],
262 [("noun", "name"), r"(?m)^toîsĭ ", "", ""],
263 [("noun", "name"), r"(?m)^toîsĭn ", "", ""],
264 # END BAD
265 [("noun", "name"), r"^ᾱ̔ ", "", ""],
266 [("noun", "name"), r"(?m)^hā ", "", ""],
267 [("noun", "name"), r"^ταὶ ", "", ""],
268 [("noun", "name"), r"(?m)^taì ", "", ""],
269 [("noun", "name"), r"^τᾶς ", "", ""],
270 [("noun", "name"), r"(?m)^tâs ", "", ""],
271 [("noun", "name"), r"^τᾶν ", "", ""],
272 [("noun", "name"), r"(?m)^tân ", "", ""],
273 [("noun", "name"), r"^τᾷ ", "", ""],
274 [("noun", "name"), r"(?m)^tâi ", "", ""],
275 [("noun", "name"), r"^τᾱν ", "", ""],
276 [("noun", "name"), r"(?m)^tān ", "", ""],
277 [("noun", "name"), r"^τοὶ ", "", ""],
278 [("noun", "name"), r"(?m)^toì ", "", ""],
279 [("noun", "name"), r"^τῇσῐ ", "", ""],
280 [("noun", "name"), r"(?m)^têisĭ ", "", ""],
281 [("noun", "name"), r"^τῇσῐν ", "", ""],
282 [("noun", "name"), r"(?m)^têisĭ ", "", ""],
283 [("noun", "name"), r"^ὀ ", "", ""],
284 [("noun", "name"), r"(?m)^o ", "", ""],
285 [("noun", "name"), r"^οἰ ", "", ""],
286 [("noun", "name"), r"(?m)^oi ", "", ""],
287 [("noun", "name"), r"^τῶ ", "", ""],
288 [("noun", "name"), r"(?m)^tô ", "", ""],
289 [("noun", "name"), r"^ᾱ ", "", ""],
290 [("noun", "name"), r"(?m)^ā ", "", ""],
291 [("noun", "name"), r"^αἰ ", "", ""],
292 [("noun", "name"), r"(?m)^ai ", "", ""],
293 [("noun", "name"), r"^τᾶ ", "", ""],
294 [("noun", "name"), r"(?m)^tâ ", "", ""],
295 [("noun", "name"), r"^τᾱν ", "", ""],
296 [("noun", "name"), r"(?m)^tān ", "", ""],
297 [
298 ("noun", "name"),
299 r"^τοῖ ",
300 "",
301 "",
302 ], # alternative suggested by user
303 [("noun", "name"), r"^τοι ", "", ""],
304 [("noun", "name"), r"(?m)^toi ", "", ""],
305 [("noun", "name"), r"^τὼς ", "", ""],
306 [("noun", "name"), r"(?m)^tṑs ", "", ""],
307 [("noun", "name"), r"^τὸς ", "", ""],
308 [("noun", "name"), r"(?m)^tòs ", "", ""],
309 [("noun", "name"), r"^τὼς ", "", ""],
310 [("noun", "name"), r"(?m)^tṑs ", "", ""],
311 # [("noun", "name"), r"^", "", ""],
312 # [("noun", "name"), r"(?m)^", "", ""],
313 ],
314 },
315 # "Anejom̃": {
316 # "numbers": ["singular", "dual", "trial", "plural"],
317 # },
318 "Arabic": {
319 "next": "semitic-group",
320 "numbers": [
321 "singular",
322 "dual",
323 "paucal",
324 "plural",
325 "collective",
326 "singulative",
327 ],
328 "reuse_cellspan": "reuse",
329 "hdr_expand_first": set(["number"]),
330 "hdr_expand_cont": set(
331 ["gender", "referent", "misc", "number", "class"]
332 ),
333 },
334 "Aragonese": {
335 "next": "romance-group",
336 },
337 "Armenian": {
338 "lang_tag_mappings": {
339 "noun": {
340 ("possessive", "singular"): ["possessive", "possessed-single"],
341 ("possessive", "plural"): ["possessive", "possessed-single"],
342 },
343 },
344 },
345 "Aromanian": {
346 "next": "romance-group",
347 },
348 "Aramaic": {
349 "next": "semitic-group",
350 },
351 "Avestan": {
352 "next": "Proto-Indo-European",
353 },
354 "Bavarian": {
355 "next": "German",
356 },
357 "Baiso": {
358 "numbers": ["singular", "paucal", "plural"],
359 },
360 "Belarusian": {
361 "next": "slavic-group",
362 },
363 "Bende": {
364 "next": "bantu-group",
365 },
366 # "Berber": {
367 # "definitenesses": ["indefinite", "definite", "construct"],
368 # },
369 "Catalan": {
370 "next": "romance-group",
371 },
372 "Chichewa": {
373 "next": "bantu-group",
374 },
375 "Chimwiini": {
376 "next": "bantu-group",
377 },
378 "Cimbrian": {
379 "next": "German",
380 },
381 "Corsican": {
382 "next": "romance-group",
383 },
384 "Czech": {
385 "next": "slavic-group",
386 "hdr_expand_first": set(["tense", "mood", "non-finite"]),
387 "hdr_expand_cont": set(["tense", "mood", "voice"]),
388 },
389 "Dalmatian": {
390 "next": "romance-group",
391 },
392 "Danish": {
393 "genders": ["common-gender", "feminine", "masculine", "neuter"],
394 "form_transformations": [
395 ["noun", r"^\(as a measure\) ", "", ""],
396 ],
397 },
398 "Eblaite": {
399 "next": "semitic-group",
400 },
401 "Egyptian": {
402 "definitenesses": ["indefinite", "definite", "construct"],
403 },
404 "Emilian": {
405 "next": "romance-group",
406 },
407 "English": {
408 "stop_non_finite_tense": True, # affect/English/Verb
409 "form_transformations": [
410 ["verb", r"^\(to\) ", "", ""],
411 ["verb", "^to ", "", ""],
412 ["verb", r"^I ", "", "first-person singular"],
413 ["verb", r"^you ", "", "second-person"],
414 ["verb", r"^he ", "", "third-person singular"],
415 ["verb", r"^we ", "", "first-person plural"],
416 ["verb", r"^they ", "", "third-person"],
417 ["verb", r"^it ", "", "third-person singular"],
418 ["verb", r"^thou ", "", "second-person singular"],
419 ["verb", r"^ye ", "", "second-person plural"],
420 ["verb", r" \(thou\)$", "", "second-person singular"],
421 ["verb", r" \(ye\)$", "", "second-person plural"],
422 ["verb", r"^he/she/it ", "", "third-person singular"],
423 ["verb", r"^he/she/it/they ", "", "third-person singular"],
424 ["verb", r"\bhim/her/it/them ", "", "third-person singular"],
425 ["verb", r"\bthem ", "", "third-person"],
426 ["verb", r"\bus ", "", "first-person plural"],
427 ["verb", r"\bme ", "", "first-person singular"],
428 ],
429 "form_replacements": {
430 "let’s be": ["let's be", "first-person plural pronoun-included"],
431 },
432 "special_phrase_splits": {
433 "I am (’m)/be": [["am (’m)", "be"], "first-person singular"],
434 "we are (’re)/be/been": [
435 ["are (’re)", "be", "been"],
436 "first-person plural",
437 ],
438 "thou art (’rt)/beest": [
439 ["art (’rt)", "beest"],
440 "second-person singular",
441 ],
442 "ye are (’re)/be/been": [
443 ["are (’re)", "be", "been"],
444 "second-person plural",
445 ],
446 "thou be/beest": [["be", "beest"], "second-person singular"],
447 "he/she/it is (’s)/beeth/bes": [
448 ["is (’s)", "beeth", "bes"],
449 "third-person singular",
450 ],
451 "they are (’re)/be/been": [
452 ["are (’re)", "be", "been"],
453 "third-person plural",
454 ],
455 "thou wert/wast": [["wert", "wast"], "second-person singular"],
456 "thou were/wert": [["were", "wert"], "second-person singular"],
457 "there has been": [["there has been"], "singular"],
458 "there have been": [["there have been"], "plural"],
459 "there is ('s)": [["there is", "there's"], "singular"],
460 "there are ('re)": [["there are", "there're"], "plural"],
461 "there was": [["there was"], "singular"],
462 "there were": [["there were"], "plural"],
463 },
464 },
465 "Estonian": {
466 "hdr_expand_first": set(["non-finite"]),
467 "hdr_expand_cont": set(["voice"]),
468 },
469 "Faroese": {
470 "ignore_top_left_text_cell": True,
471 },
472 "Fijian": {
473 "numbers": ["singular", "paucal", "plural"],
474 },
475 "Finnish": {
476 "hdr_expand_first": set([]),
477 },
478 "French": {
479 "next": "romance-group",
480 },
481 "Friulian": {
482 "next": "romance-group",
483 },
484 "Galician": {
485 "next": "romance-group",
486 },
487 "German": {
488 "next": "german-group",
489 "form_transformations": [
490 ["verb", "^ich ", "", "first-person singular"],
491 ["verb", "^du ", "", "second-person singular"],
492 ["verb", "^er ", "", "third-person singular"],
493 ["verb", "^wir ", "", "first-person plural"],
494 ["verb", "^ihr ", "", "second-person plural"],
495 ["verb", "^sie ", "", "third-person plural"],
496 [
497 "verb",
498 "^dass ich ",
499 "",
500 "first-person singular subordinate-clause",
501 ],
502 [
503 "verb",
504 "^dass du ",
505 "",
506 "second-person singular subordinate-clause",
507 ],
508 [
509 "verb",
510 "^dass er ",
511 "",
512 "third-person singular subordinate-clause",
513 ],
514 [
515 "verb",
516 "^dass wir ",
517 "",
518 "first-person plural subordinate-clause",
519 ],
520 [
521 "verb",
522 "^dass ihr ",
523 "",
524 "second-person plural subordinate-clause",
525 ],
526 [
527 "verb",
528 "^dass sie ",
529 "",
530 "third-person plural subordinate-clause",
531 ],
532 ["verb", r" \(du\)$", "", "second-person singular"],
533 ["verb", r" \(ihr\)$", "", "second-person plural"],
534 ["adj", "^er ist ", "", "masculine singular"],
535 ["adj", "^sie ist ", "", "feminine singular"],
536 ["adj", "^es ist ", "", "neuter singular"],
537 ["adj", "^sie sind ", "", "plural"],
538 ["adj", "^keine ", "keine ", "negative"],
539 ["adj", "^keiner ", "keiner ", "negative"],
540 ["adj", "^keinen ", "keinen ", "negative"],
541 ],
542 "conditionally_ignored_cells": {
543 "definite": [
544 "der",
545 "die",
546 "das",
547 "des",
548 "dem",
549 "den",
550 ],
551 "indefinite": [
552 "ein",
553 "eine",
554 "eines",
555 "einer",
556 "einem",
557 "einen",
558 ],
559 "negative": [
560 "kein",
561 "keine",
562 "keiner",
563 "keinen",
564 ],
565 },
566 },
567 "German Low German": {
568 "next": "German",
569 "hdr_expand_first": set(["mood", "non-finite"]),
570 "hdr_expand_cont": set(["tense"]),
571 },
572 "Gothic": {
573 "next": "Proto-Indo-European", # Has dual
574 },
575 "Greek": {
576 "next": "indo-european-group",
577 "hdr_expand_first": set(["mood", "tense", "aspect", "dummy"]),
578 "hdr_expand_cont": set(["tense", "person", "number", "aspect"]),
579 "imperative_no_tense": True,
580 "reuse_cellspan": "reuse",
581 "skip_mood_mood": True,
582 "skip_tense_tense": True,
583 # είμαι/Greek
584 "parentheses_for_informal": True,
585 "square_brackets_for_rare": True,
586 "curly_brackets_for_archaic": True,
587 # For greek originally
588 "minor_text_cleanups": {
589 r"\s+➤\s*$": "",
590 },
591 },
592 "Hawaiian": {
593 "next": "austronesian-group",
594 },
595 "Hebrew": {
596 "next": "semitic-group",
597 },
598 "Hijazi Arabic": {
599 "next": "semitic-group",
600 },
601 "Hopi": {
602 "numbers": ["singular", "paucal", "plural"],
603 },
604 "Hungarian": {
605 "hdr_expand_first": set([]),
606 "hdr_expand_cont": set([]),
607 },
608 "Hunsrik": {
609 "next": "German",
610 },
611 "Icelandic": {
612 "ignore_top_left_text_cell": True,
613 },
614 "Ilokano": {
615 "next": "austronesian-group",
616 },
617 "Inari Sami": {
618 "next": "samojedic-group",
619 },
620 "Inuktitut": {
621 "numbers": ["singular", "dual", "plural"],
622 },
623 "Italian": {
624 "next": "romance-group",
625 "hdr_expand_first": set(["mood", "tense"]),
626 "hdr_expand_cont": set(["person", "register", "number", "misc"]),
627 "form_transformations": [
628 ["verb", "^non ", "", "negative"],
629 ],
630 },
631 "Irish": {
632 "next": "Old Irish",
633 "genders": ["masculine", "feminine"],
634 },
635 "Kamba": {
636 "next": "bantu-group",
637 },
638 "Kapampangan": {
639 "next": "austronesian-group",
640 },
641 # "Khoe": {
642 # "numbers": ["singular", "dual", "plural"],
643 # },
644 "Kikuyu": {
645 "next": "bantu-group",
646 },
647 "Ladin": {
648 "next": "romance-group",
649 },
650 # "Larike": {
651 # "numbers": ["singular", "dual", "trial", "plural"],
652 # },
653 "Latin": {
654 "next": "romance-group",
655 "stop_non_finite_voice": True,
656 },
657 "Latvian": {
658 "empty_row_resets": True,
659 },
660 "Ligurian": {
661 "next": "romance-group",
662 },
663 "Lihir": {
664 "numbers": ["singular", "dual", "trial", "paucal", "plural"],
665 },
666 "Lingala": {
667 "next": "bantu-group",
668 },
669 "Lombard": {
670 "next": "romance-group",
671 },
672 "Lower Sorbian": {
673 "next": "slavic-group",
674 },
675 "Luganda": {
676 "next": "bantu-group",
677 },
678 "Lule Sami": {
679 "next": "samojedic-group",
680 },
681 "Luxembourgish": {
682 "next": "German",
683 },
684 "Maltese": {
685 "next": "semitic-group",
686 },
687 "Maore Comorian": {
688 "next": "bantu-group",
689 },
690 "Masaba": {
691 "next": "bantu-group",
692 },
693 "Mirandese": {
694 "next": "romance-group",
695 },
696 "Moroccan Arabic": {
697 "next": "semitic-group",
698 },
699 # "Motuna": {
700 # "numbers": ["singular", "paucal", "plural"],
701 # },
702 "Mwali Comorian": {
703 "next": "bantu-group",
704 },
705 "Mwani": {
706 "next": "bantu-group",
707 },
708 "Navajo": {
709 "numbers": [
710 "singular",
711 "plural",
712 "dual",
713 "duoplural",
714 ],
715 },
716 "Neapolitan": {
717 "next": "romance-group",
718 },
719 "Nenets": {
720 "next": "uralic-group",
721 },
722 "Ngazidja Comorian": {
723 "next": "bantu-group",
724 },
725 "Niuean": {
726 "next": "austronesian-group",
727 },
728 "Northern Kurdish": {
729 "numbers": ["singular", "paucal", "plural"],
730 },
731 "Northern Ndebele": {
732 "next": "bantu-group",
733 },
734 "Northern Sami": {
735 "next": "samojedic-group",
736 },
737 # "Mussau": {
738 # "numbers": ["singular", "dual", "trial", "plural"],
739 # },
740 "Nyankole": {
741 "next": "bantu-group",
742 },
743 "Occitan": {
744 "next": "romance-group",
745 },
746 "Old Church Slavonic": {
747 "next": "Proto-Indo-European", # Has dual
748 },
749 "Old English": {
750 "next": "Proto-Indo-European", # Had dual in pronouns
751 },
752 "Old Norse": {
753 "next": "Proto-Indo-European", # Had dual in pronouns
754 },
755 "Old Irish": {
756 "next": "Proto-Indo-European", # Has dual
757 },
758 "Pennsylvania German": {
759 "next": "German",
760 },
761 "Phoenician": {
762 "next": "semitic-group",
763 },
764 "Phuthi": {
765 "next": "bantu-group",
766 },
767 "Pite Sami": {
768 "next": "samojedic-group",
769 },
770 "Polish": {
771 "next": "slavic-group",
772 },
773 "Portuguese": {
774 "next": "romance-group",
775 "genders": ["masculine", "feminine"],
776 },
777 "Proto-Germanic": {
778 "next": "Proto-Indo-European", # Has dual
779 },
780 "Proto-Indo-European": {
781 "numbers": ["singular", "dual", "plural"],
782 },
783 "Proto-Samic": {
784 "next": "samojedic-group",
785 },
786 "Proto-Uralic": {
787 "next": "uralic-group",
788 },
789 "Raga": {
790 "numbers": ["singular", "dual", "trial", "plural"],
791 },
792 "Romagnol": {
793 "next": "romance-group",
794 },
795 "Romanian": {
796 "next": "romance-group",
797 },
798 "Romansch": {
799 "next": "romance-group",
800 },
801 "Russian": {
802 "next": "slavic-group",
803 "hdr_expand_first": set(["non-finite", "mood", "tense"]),
804 "hdr_expand_cont": set(["tense", "number"]),
805 "reuse_cellspan": "stop",
806 },
807 "Rwanda-Rundi": {
808 "next": "bantu-group",
809 },
810 "Sanskrit": {
811 "next": "Proto-Indo-European",
812 },
813 "Sardinian": {
814 "next": "romance-group",
815 },
816 "Sassarese": {
817 "next": "romance-group",
818 },
819 "Scottish Gaelic": {
820 "numbers": ["singular", "dual", "plural"],
821 },
822 "Serbo-Croatian": {
823 "next": "slavic-group",
824 "numbers": ["singular", "dual", "paucal", "plural"],
825 },
826 "Sicilian": {
827 "next": "romance-group",
828 },
829 "Skolt Sami": {
830 "next": "samojedic-group",
831 },
832 "Slovene": {
833 "next": "slavic-group",
834 },
835 "Shona": {
836 "next": "bantu-group",
837 },
838 "Sotho": {
839 "next": "bantu-group",
840 },
841 "South Levantine Arabic": {
842 "next": "semitic-group",
843 },
844 "Southern Ndebele": {
845 "next": "bantu-group",
846 },
847 "Spanish": {
848 "next": "romance-group",
849 "form_transformations": [
850 ["verb", "^no ", "", "negative"],
851 ],
852 "special_references": {
853 "vos": "informal vos-form second-person singular",
854 "ᵛᵒˢ": "informal vos-form second-person singular",
855 "tú": "informal second-person singular",
856 },
857 },
858 "Swahili": {
859 "next": "bantu-group",
860 },
861 "Swedish": {
862 "hdr_expand_first": set(["referent"]),
863 "hdr_expand_cont": set(["degree", "polarity"]),
864 "genders": ["common-gender", "feminine", "masculine", "neuter"],
865 },
866 "Swazi": {
867 "next": "bantu-group",
868 },
869 # "Syriac": {
870 # "next": "semitic-group",
871 # },
872 "Tagalog": {
873 "next": "austronesian-group",
874 },
875 "Tausug": {
876 "next": "austronesian-group",
877 },
878 "Tigre": {
879 "next": "semitic-group",
880 },
881 "Tigrinya": {
882 "next": "semitic-group",
883 },
884 "Tongan": {
885 "next": "austronesian-group",
886 },
887 "Tsonga": {
888 "next": "bantu-group",
889 },
890 "Tswana": {
891 "next": "bantu-group",
892 },
893 "Tumbuka": {
894 "next": "bantu-group",
895 },
896 # "Tuscan": {
897 # "next": "romance-group",
898 # },
899 "Ugaritic": {
900 "next": "semitic-group",
901 },
902 "Ukrainian": {
903 "next": "slavic-group",
904 },
905 "Upper Sorbian": {
906 "next": "slavic-group",
907 },
908 # "Valencian": {
909 # "next": "romance-group",
910 # },
911 "Venetian": {
912 "next": "romance-group",
913 },
914 "Warlpiri": {
915 "numbers": ["singular", "paucal", "plural"],
916 },
917 "Xhosa": {
918 "next": "bantu-group",
919 },
920 "Zulu": {
921 "next": "bantu-group",
922 },
923 "ǃXóõ": {
924 "next": "bantu-group",
925 },
926}
929# Sanity check lang_specific
930# def_ls_keys = lang_specific["default"].keys()
931# for k, v in lang_specific.items():
932# if k[0].isupper() and k not in languages_by_name:
933# raise AssertionError(
934# "key {!r} in lang_specific is not a valid language"
935# .format(k))
936# assert isinstance(v, dict)
937# for kk, vv in v.items():
938# if kk not in def_ls_keys and kk != "next":
939# raise AssertionError("{} key {!r} not in default entry"
940# .format(k, kk))
941# if kk in ("hdr_expand_first", "hdr_expand_cont"):
942# if not isinstance(vv, set):
943# raise AssertionError("{} key {!r} must be set"
944# .format(k, kk))
945# for t in vv:
946# if t not in tag_categories:
947# raise AssertionError("{} key {!r} invalid tag category {}"
948# .format(k, kk, t))
949# elif kk in ("genders", "numbers", "persons", "strengths", "voices"):
950# if not vv:
951# continue
952# if not isinstance(vv, (list, tuple, set)):
953# raise AssertionError("{} key {!r} must be list/tuple/set"
954# .format(k, kk))
955# for t in vv:
956# if t not in valid_tags:
957# raise AssertionError("{} key {!r} invalid tag {!r}"
958# .format(k, kk, t))
959# elif kk == "lang_tag_mappings" and vv is not None:
960# for pos, transf in vv.items():
961# assert pos in PARTS_OF_SPEECH
962# assert isinstance(transf, dict)
963# for pre, post in transf.items():
964# assert isinstance(pre, tuple)
965# assert all(t in valid_tags for t in pre)
966# assert isinstance(post, list)
967# assert all(t in valid_tags for t in post)
968# elif kk == "next":
969# if vv not in lang_specific:
970# raise AssertionError("{} key {!r} value {!r} is not defined"
971# .format(k, kk, vv))
def get_lang_conf(lang: str, field: str):
    """Return ``field`` from the language-specific configuration.

    The lookup starts at the ``lang_specific`` entry for ``lang`` and
    follows each entry's ``"next"`` link until an entry defining ``field``
    is found.  Entries without a ``"next"`` key, and languages that are not
    listed at all, fall through to the ``"default"`` entry.

    Args:
        lang: Language (or group) name used as key into ``lang_specific``.
        field: Name of the configuration field to fetch.

    Returns:
        The value stored for ``field`` in the first entry of the chain
        that defines it.

    Raises:
        RuntimeError: If ``field`` is not defined even in ``"default"``,
            or if the ``"next"`` chain revisits an entry (a cycle, or a
            missing ``"default"`` entry), which previously looped forever.
    """
    assert isinstance(lang, str)
    assert isinstance(field, str)
    visited = set()
    while True:
        # Every step must reach a new entry; seeing one twice means the
        # chain can never terminate.
        if lang in visited:
            raise RuntimeError(
                "Cyclic lang_specific chain at {!r} while looking up "
                "{!r}".format(lang, field)
            )
        visited.add(lang)
        lconfigs = lang_specific.get(lang)
        if lconfigs is None:
            # Unlisted language: use the defaults.
            lang = "default"
            continue
        if field in lconfigs:
            return lconfigs[field]
        if lang == "default":
            # End of every chain; the field does not exist anywhere.
            raise RuntimeError("Invalid lang_specific field {!r}".format(field))
        lang = lconfigs.get("next", "default")
def lang_specific_tags(lang, pos, form):
    """Derive tags from the word form itself, using per-language rules.

    The rules come from the "form_transformations" setting of ``lang``.
    Each rule is (part(s) of speech, regex, replacement, tag string).  The
    first rule whose part of speech covers ``pos`` and whose regex is found
    in ``form`` wins: the matched text is swapped for the replacement and
    the rule's whitespace-separated tags are returned.  For example, a
    German verb rule "^ich " -> "" strips the pronoun and yields
    first-person singular tags.

    Returns a tuple (possibly adjusted form, list of tags); the form is
    returned unchanged with an empty list when no rule applies."""
    assert isinstance(lang, str)
    assert isinstance(pos, str)
    assert isinstance(form, str)
    for rule_pos, regex, replacement, tag_str in get_lang_conf(
        lang, "form_transformations"
    ):
        # A rule names either one part of speech or a tuple of them;
        # handle both uniformly as a tuple.
        if isinstance(rule_pos, tuple):
            accepted = rule_pos
        else:
            accepted = (rule_pos,)
        assert all(p in PARTS_OF_SPEECH for p in accepted)
        if pos not in accepted:
            continue
        found = re.search(regex, form)
        if found is None:
            continue
        extracted = tag_str.split()
        assert all(t in valid_tags for t in extracted)
        adjusted = form[: found.start()] + replacement + form[found.end() :]
        return adjusted, extracted
    return form, []