Coverage for src/wiktextract/extractor/fr/tags.py: 87%
42 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-05 00:35 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-05 00:35 +0000
1# Grammatical glossary appendix:
2# https://fr.wiktionary.org/wiki/Annexe:Glossaire_grammatical
3# List of templates:
4# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles
5from .models import WordEntry
# https://en.wikipedia.org/wiki/Grammatical_gender
# Lowercase French gender labels mapped to a single English tag (str) or to
# several English tags (list of str).
GENDER_TAGS: dict[str, str | list[str]] = {
    "commun": "common",
    "féminin": "feminine",
    "masculin": "masculine",
    "neutre": "neuter",
    # https://fr.wiktionary.org/wiki/Modèle:mf
    "masculin et féminin identiques": ["masculine", "feminine"],
    # table header: https://fr.wiktionary.org/wiki/Modèle:fr-rég
    "masculin et féminin": ["masculine", "feminine"],
    # "Modèle:mf ?", "Modèle:fm ?"
    "masculin ou féminin (l’usage hésite)": ["masculine", "feminine"],
    "féminin ou masculin (l’usage hésite)": ["feminine", "masculine"],
    "invariable": "invariable",  # Modèle:invar
    # Modèle:flex-ku-nommixt
    "masculin sing.": ["masculine", "singular"],
    "féminin sing.": ["feminine", "singular"],
    # Template:ja-flx-adj-な
    "neutre négatif": ["neuter", "negative"],
    "neutre passé": ["neuter", "past"],
    "neutre négatif passé": ["neuter", "negative", "past"],
    "poli négatif": ["polite", "negative"],
    "poli passé": ["polite", "past"],
    "poli négatif passé": ["polite", "negative", "past"],
    # Template:m
    "masculin animé": ["masculine", "animate"],
    "masculin inanimé": ["masculine", "inanimate"],
    # Template:f
    "féminin animé": ["feminine", "animate"],
    "féminin inanimé": ["feminine", "inanimate"],
    # Template:n
    "neutre animé": ["neuter", "animate"],
    "neutre inanimé": ["neuter", "inanimate"],
    # Template:fr-rég
    # This key contains a newline character (presumably a line break in the
    # template's table header; verify against the template output).
    "masculin\net féminin": ["masculine", "feminine"],
}
# https://en.wikipedia.org/wiki/Grammatical_number
# Lowercase number labels mapped to English tag(s). Besides grammatical
# number, this table also holds the person entries used by the
# avk-tab-conjug and nl-conj-cons templates referenced below.
NUMBER_TAGS: dict[str, str | list[str]] = {
    "singulier": "singular",
    "pluriel": "plural",
    "duel": "dual",
    "collectif": "collective",
    "singulatif": "singulative",
    "indénombrable": "uncountable",  # sv-nom-c-ind
    "au singulier": "singular",
    "au singulier uniquement": "singular-only",
    "au pluriel": "plural",
    "au pluriel uniquement": "plural-only",
    "singulier et pluriel identiques": ["singular", "plural"],
    "nom collectif": "collective",
    # "générique": "",  # Modèle:g
    # "nom d'unité": "",  # Modèle:nu
    "généralement indénombrable": "uncountable",
    "dénombrable": "countable",
    # Modèle:br-nom
    "pluriel 1": "plural",
    "pluriel 2": "plural",
    "pluriel 3": "plural",
    "pluriel 4": "plural",
    # https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug
    "1": "first-person",
    "2": "second-person",
    "3": "third-person",
    "4": "fourth-person",
    # Template:nl-conj-cons
    # https://en.wikipedia.org/wiki/Dutch_grammar#Personal_pronouns
    "ik": ["first-person", "singular"],
    "jij": ["second-person", "singular", "informal"],
    "hij, zij, het": "third-person",
    "wij": ["first-person", "plural"],
    "jullie": ["second-person", "plural", "informal"],
    "zij": ["third-person", "plural"],
    "u": "second-person",
}
83# https://en.wikipedia.org/wiki/Grammatical_mood
84MOOD_TAGS: dict[str, str] = {
85 "indicatif": "indicative",
86 "subjonctif": "subjunctive",
87 "conditionnel": "conditional",
88 "impératif": "imperative",
89 "volitif": "volitive",
90 "déclaratif": "declarative",
91 "interrogatif": "interrogative",
92 "aperceptif": "apperceptive",
93 "euphémique": "euphemistic",
94 "évidentiel": "evidential",
95 "spéculatif": "speculative",
96 "assertif": "assertive",
97 "hortatif": "hortative",
98 "promissif": "promissive",
99 "conditionnel / subjonctif": ["conditional", "subjunctive"],
100 "conjonctif": "subjunctive",
101 "provisionnel": "temporal",
102 # Template:de-conj
103 "subjonctif i": "subjunctive-i",
104 "subjonctif ii": "subjunctive-ii",
105 "conjectural/volitif": ["conjectural", "volitive"],
106}
# Lowercase French labels for verb forms and conjugation-table headers,
# mapped to a single English tag (str) or several tags (list of str).
VERB_FORM_TAGS: dict[str, str | list[str]] = {
    "participe": "participle",
    "imparfait": "imperfect",
    # Template:ku-conj-trans
    "parfait": "perfect",
    "imparfait narratif": ["imperfect", "narrative"],
    "infinitif": "infinitive",
    "gérondif": "gerund",
    # template "pt-verbe-flexion"
    "infinitif personnel": ["infinitive", "personal"],
    "supin": "supine",
    # Template:ko-conj
    "conjugaison": "conjugation",
    "radical": "radical",
    "formes finales": "final",
    "registre formel": "formal",
    "registre informel": "informal",
    "non poli": "impolite",
    "poli": "polite",
    "formes nominales": "nominal",
    "formes conjonctives": "subjunctive",
    # Template:ja-在る
    "formes de base": "base-form",
    "affirmatif": "affirmative",
    "négatif": "negative",
    "adverbial": "adverbial",
    # Template:bg-verbe186
    "aoriste": "aorist",
    "participe passé passif": ["participle", "past", "passive"],
    "participe passé actif": ["participle", "past", "active"],
    "participe imparfait": ["participle", "imperfect"],
    "auxiliaire": "auxiliary",
    "bitransitif": "ditransitive",
    "déterminé": "determinate",
    "indéterminé": "indeterminate",
    # Template:irrégulier
    "irrégulier": "irregular",
}
# https://en.wikipedia.org/wiki/Grammatical_case
# Lowercase French case labels mapped to a single English tag (str) or to
# several tags (list of str).
CASE_TAGS: dict[str, str | list[str]] = {
    "ablatif": "ablative",
    "accusatif": "accusative",
    "accusatif génitif": ["accusative", "genitive"],
    "nominatif": "nominative",
    "datif": "dative",
    "génitif": "genitive",
    "vocatif": "vocative",
    "instrumental": "instrumental",
    "locatif": "locative",
    "comitatif": "comitative",
    "essif": "essive",
    "illatif": "illative",
    # Template:ro-nom-tab
    # These keys contain a newline character between the two case names.
    "nominatif\naccusatif": ["nominative", "accusative"],
    "datif\ngénitif": ["dative", "genitive"],
    # Template:ko-nom
    "nominatif / attributif": ["nominative", "attributive"],
    # Modèle:fro-adj
    "sujet": "subject",
    "régime": "oblique",
}
# https://en.wikipedia.org/wiki/Grammatical_tense
# Lowercase French tense labels mapped to a single English tag (str) or to
# several tags (list of str).
TENSE_TAGS: dict[str, str | list[str]] = {
    "présent": "present",
    "passé": "past",
    "passé simple": "past",
    "futur": "future",
    "futur simple": "future",
    # https://en.wikipedia.org/wiki/Passé_composé
    "passé composé": ["past", "multiword-construction"],
    "plus-que-parfait": "pluperfect",
    "passé antérieur": ["past", "anterior"],
    "futur antérieur": ["future", "perfect"],
    "prétérit": "preterite",
    # This key contains a newline character after the comma.
    "présent simple,\n3ᵉ pers. sing.": ["present", "third-person", "singular"],
    "participe passé": ["participle", "past"],
    "participe présent": ["participle", "present"],
    # Template:ku-conj-trans
    "présent progressif": ["present", "progressive"],
    "prétérit et imparfait": ["preterite", "imperfect"],
    "non passé": "non-past",
    "présent / futur": ["present", "future"],
    # Template:de-conj
    "futur i": "future-i",
    "futur ii": "future-ii",
    # Template:it-irrégulier-avere-1
    "présent affirmatif": ["present", "affirmative"],
    "présent négatif": ["present", "negative"],
}
# https://en.wikipedia.org/wiki/Grammatical_person
# Lowercase French person labels mapped to a single English tag (str) or to
# a person-plus-number pair (list of str).
PERSON_TAGS: dict[str, str | list[str]] = {
    # Both ordinal spellings of "first" map to the same tag.
    "1ᵉ personne": "first-person",
    "1ʳᵉ personne": "first-person",
    "2ᵉ personne": "second-person",
    "3ᵉ personne": "third-person",
    # Modèle:avk-conj
    "1ʳᵉ du sing.": ["first-person", "singular"],
    "2ᵉ du sing.": ["second-person", "singular"],
    "3ᵉ du sing.": ["third-person", "singular"],
    "1ʳᵉ du plur.": ["first-person", "plural"],
    "2ᵉ du plur.": ["second-person", "plural"],
    "3ᵉ du plur.": ["third-person", "plural"],
    "4ᵉ du plur.": ["fourth-person", "plural"],
}
# Lowercase French definiteness labels mapped to an English tag.
SEMANTICS_TAGS: dict[str, str] = {
    # https://en.wikipedia.org/wiki/Definiteness
    "défini": "definite",
    "indéfini": "indefinite",
    # template:ro-nom-tab
    "articulé": "definite",
    "non articulé": "indefinite",
}
225COMPARISON_TAGS: dict[str, str] = {
226 # https://en.wikipedia.org/wiki/Comparison_(grammar)
227 "positif": "positive",
228 "comparatif": "comparative",
229 "superlatif": "superlative",
230 "non comparable": "not-comparable",
231 "superlatif absolu": ["superlative", "absolute"],
232}
# https://en.wikipedia.org/wiki/Occitan_language#Writing_system
# Lowercase French labels for Occitan spelling norms mapped to an English
# tag. Only the Mistralian norm is mapped; the other two entries are
# commented out and have no tag assigned.
OCCITAN_NORM_TAGS: dict[str, str] = {
    # https://fr.wiktionary.org/wiki/Modèle:oc-norme_mistralienne
    "graphie mistralienne": "Mistralian",
    # https://fr.wiktionary.org/wiki/Modèle:oc-norme_classique
    # "graphie normalisée": "",
    # Modèle:oc-norme bonnaudienne
    # "graphie bonnaudienne": "",
}
# https://en.wikipedia.org/wiki/Breton_mutations
# https://fr.wiktionary.org/wiki/Modèle:br-nom
# Lowercase French mutation labels mapped to an English tag.
BRETON_MUTATION_TAGS: dict[str, str] = {
    "non muté": "unmutated",
    "adoucissante": "mutation-soft",
    "durcissante": "mutation-hard",
    "spirante": "mutation-spirant",
    "nasale": "mutation-nasal",
}
# Lowercase script/transcription labels mapped to an English tag; every key
# here maps to the identical string.
JA_TAGS: dict[str, str] = {
    # https://fr.wiktionary.org/wiki/Modèle:ja-trans
    "kanji": "kanji",
    "hiragana": "hiragana",
    "katakana": "katakana",
    "transcription": "transcription",
}
262OTHER_GRAMMATICAL_TAGS: dict[str, str] = {
263 # https://fr.wiktionary.org/wiki/Modèle:be-tab-cas
264 "prépositionnel": "prepositional",
265 "anglicisme": "Anglicism",
266 "pronominal": "pronominal",
267 "diminutif": "diminutive",
268 "réfléchi": "reflexive", # Modèle:réfl
269 "réciproque": "reciprocal", # Modèle:réciproque
270 "impersonnel": "impersonal", # Modèle:impers
271 "transitif": "transitive", # Modèle:t
272 "transitif indirect": ["transitive", "indirect"], # Modèle:transitif indir
273 "intransitif": "intransitive", # Modèle:i
274 "injurieux": "offensive", # Modèle:injurieux
275 # Modèle:zh-formes
276 "simplifié": "Simplified-Chinese",
277 "traditionnel": "Traditional-Chinese",
278 # Modèle:flex-ku-nomf
279 "ézafé principal": ["ezafe", "primary"],
280 "ézafé secondaire": ["ezafe", "secondary"],
281 "cas oblique": "oblique",
282 # Modèle:ku-conj-trans
283 "forme affirmative": "affirmative",
284 "forme négative": "negative",
285 # Modèle:bg-nom
286 "forme de base": "base-form",
287 "pluriel numéral": ["plural", "numeral"],
288 "animé": "animate",
289 "inanimé": "inanimate",
290 # Template:ko-nom
291 "hangeul": "hangeul",
292 "hanja": "hanja",
293 "avec clitique": "clitic",
294 "indéclinable": "indeclinable",
295 "toponyme": "toponymic",
296 "applicatif": "applicative",
297 "causatif": "causative",
298 "sigle": "abbreviation",
299 "attributif": "attributive",
300 "prédicatif": "predicative",
301 # Template:cy-mut
302 "non muté": "unmutated",
303 "lénition": "lenition",
304 "nasalisation": "nasalization",
305 "syllabaire": "Syllabics",
306 "par ellipse": "ellipsis", # Template:ellipse
307 "ironique": "ironic",
308 "suffixe": "suffix",
309 # Template:avk-tab-conjug
310 "conjugaison présent indicatif": ["present", "indicative"],
311 # Modèle:de-adjectif-déclinaisons
312 "déclinaison forte": "strong",
313 "déclinaison faible": "weak",
314 "déclinaison mixte": "mixed",
315 "singulier / pluriel": ["singular", "plural"],
316 # Template:ja-する
317 "inaccompli": "uncompleted",
318 "imperfectif (未然形, mizen-kei)": "imperfective",
319 "continuatif (連用形, ren'yō-kei)": "continuative",
320 "conclusif (終止形, shūshi-kei)": "terminal",
321 "attributif (連体形, rentai-kei)": "attributive",
322 "hypothétique (仮定形, katei-kei)": "hypothetical",
323 "impératif (命令形, meirei-kei)": "imperative",
324 "forme en -te": "gerund",
325 "désidératif adjectif variable (flexions)": ["desiderative", "adjective"],
326 # Template:ja-flx-adj-な
327 "imperfectif (未然形)": "imperfective",
328 "continuatif (連用形)": "continuative",
329 "conclusif (終止形)": "terminal",
330 "attributif (連体形)": "attributive",
331 "hypothétique (仮定形)": "hypothetical",
332 "impératif (命令形)": "imperative",
333 # Template:ko-nom
334 "avec\nclitique": "clitic",
335 "thème": "stem",
336 "nominatif\n/ attributif": ["nominative", "attributive"],
337 "seulement": "exclusive",
338 # Template:pt-conj/*
339 "formas impessoais\n(formes impersonnelles)": "impersonal",
340 "infinitivo (infinitif)": "infinitive",
341 "gerúndio (gérondif)": "gerund",
342 "particípio (participe)": "participle",
343 "formas pessoais\n(formes personnelles)": "personal",
344 "singular (singulier)": "singular",
345 "plural (pluriel)": "plural",
346 "primeira (première)": "first-person",
347 "segunda (deuxième)": "second-person",
348 "terceira (troisième)": "third-person",
349 "infinitivo pessoal\n(infinitif personnel)": ["infinitive", "personal"],
350 "modo indicativo (indicatif)": "indicative",
351 "presente\n(présent)": "present",
352 "pretérito imperfeito\n(prétérit imparfait)": ["imperfect", "preterite"],
353 "pretérito perfeito\n(prétérit parfait)": ["perfect", "preterite"],
354 "pretérito mais-que-perfeito\n(prétérit plus-que-parfait)": [
355 "perfect",
356 "pluperfect",
357 ],
358 "futuro do presente\n(futur du présent)": "future",
359 "futuro do pretérito\n(futur du prétérit)": "conditional",
360 "modo subjuntivo (conjuntivo) (mode subjonctif)": "subjunctive",
361 "futuro\n(futur)": "future",
362 "mode imperativo (impératif)": "imperative",
363 "afirmativo\n(affirmatif)": "affirmative",
364 "negativo\n(négatif)": "negative",
365 "brésilien": "Brazilian",
366}
368# template text before gloss
369SENSE_TAGS: dict[str, str] = {
370 # https://fr.wiktionary.org/wiki/Modèle:figuré
371 # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_relation_entre_les_définitions
372 # Catégorie:Modèles de genre textuel
373 # Catégorie:Modèles de registre
374 "sens figuré": "figuratively",
375 "sens propre": "literally",
376 "par métonymie": "metonymically", # Modèle:par métonymie
377 "par hyperbole": "hyperbole",
378 "par extension": "broadly",
379 "par analogie": "analogy",
380 "en particulier": "especially",
381 "par litote": "litotes",
382 "par euphémisme": "euphemism",
383 "spécifiquement": "specifically",
384 "génériquement": "generically",
385 "spécialement": "especially",
386 "généralement": "generally",
387 "enclise": "enclitic",
388 "idiotisme": "idiomatic",
389 "péjoratif": "pejorative",
390 "désuet": "obsolete",
391 "archaïsme": "archaic",
392 "vieilli": "dated",
393 "néologisme": "neologism",
394 "argot": "slang",
395 "rare": "rare",
396 # "plus rare": "rare",
397 "littéraire": "literary", # Modèle:littéraire
398 "poétique": "poetic", # Modèle:poétique
399 # "didactique": "", # Modèle:didactique
400 "soutenu": "formal", # Modèle:soutenu
401 "informel": "informal", # Modèle:informel
402 "familier": "familiar", # Modèle:familier
403 "très familier": "very-familiar", # Modèle:très familier
404 "populaire": "colloquial", # Modèle:populaire
405 "vulgaire": "vulgar", # Modèle:vulgaire
406 "langage enfantin": "childish", # Modèle:enfantin
407 # Catégorie:Modèles de thématique
408 "anglicisme informatique": "Anglicism",
409 "proverbe": "proverb",
410 "collectivement": "collectively",
411 "courant": "common", # Modèle:courant
412 "adjectif attribut": ["adjective", "attributive"],
413}
415# https://en.wikipedia.org/wiki/Voice_(grammar)
416VOICE_TAGS: dict[str, str | list[str]] = {
417 # https://fr.wiktionary.org/wiki/Modèle:eo-conj
418 "participe actif": ["participle", "active"],
419 "participe passif": ["participle", "passive"],
420 "adverbe actif": ["adverb", "active"],
421 "adverbe passif": ["adverb", "passive"],
422 "substantif\nactif": ["subsuntive", "active"],
423 "substantif\npassif": ["subsuntive", "passive"],
424 "actif": "active",
425 "passif": "passive",
426 "adverbe": "adverb",
427}
# Module:lexique/data
# Lowercase French lexicon (subject field) labels mapped to an English tag.
LEXIQUE_TAGS: dict[str, str] = {
    "hindouisme": "Hinduism",
    "judaïsme": "Judaism",
    "marxisme": "Marxism",
    "nazisme": "Nazism",
    "physique": "physical",
    "rhétorique": "rhetoric",
    "antiquité": "Ancient",
    "antiquité grecque": "Ancient-Greek",
    "antiquité romaine": "Ancient-Roman",
    "bible": "Biblical",
    "moyen âge": "Middle-Ages",
    "union européenne": "European-Union",
    "analyse": "analytic",
}
# Template:cmn-pron
# https://fr.wiktionary.org/wiki/自由
# Lowercase labels for Chinese varieties and romanization systems mapped to
# a single English tag (str) or several tags (list of str).
ZH_PRON_TAGS: dict[str, str | list[str]] = {
    "pinyin": "Pinyin",
    "efeo": "EFEO",  # https://en.wikipedia.org/wiki/EFEO_Chinese_transcription
    "wade-giles": "Wade-Giles",
    "yale": "Yale",
    "zhuyin": "Bopomofo",
    "mandarin": "Mandarin",
    "cantonais": "Cantonese",
    "cantonais (yue)": "Cantonese",
    "jyutping": "Jyutping",
    "hakka": "Hakka",
    "pha̍k-fa-sṳ": "Phak-fa-su",
    "meixian, guangdong": ["Meixian", "Guangdong"],
    "jin": "Jin",
    "mindong": "Min-Dong",
    # https://en.wikipedia.org/wiki/Bàng-uâ-cê
    "bàng-uâ-cê (fuzhou)": ["Bang-ua-ce", "Fuzhou"],
    "minnan": "Min-Nan",
    "pe̍h-ōe-jī (hokkien : fujian, taïwan)": [
        "Peh-oe-ji",
        "Hokkien",
        "Fujian",
        "Taiwan",
    ],
    "chaozhou, peng'im": ["Chaozhou", "Peng'im"],
    "wu": "Wu",
    "shanghai": "Shanghai",
    "chinois médiéval": "Medieval-Chinese",
    "chinois archaïque": "Old-Chinese",
    "baxter-sagart": "Baxter-Sagart",
    "zhengzhang": "Zhengzhang",
}
# Lowercase French grammatical-aspect labels mapped to an English tag.
ASPECT_TAGS: dict[str, str] = {
    "perfectif": "perfective",  # Modèle:perfectif
    "imperfectif": "imperfective",  # Modèle:imperfectif
}
# Single lookup table built by merging every tag table in this module, in
# the order listed. When the same key occurs in more than one table, the
# value from the table unpacked later replaces the earlier one.
GRAMMATICAL_TAGS: dict[str, str | list[str]] = {
    **GENDER_TAGS,
    **NUMBER_TAGS,
    **MOOD_TAGS,
    **VERB_FORM_TAGS,
    **CASE_TAGS,
    **TENSE_TAGS,
    **PERSON_TAGS,
    **SEMANTICS_TAGS,
    **COMPARISON_TAGS,
    **OCCITAN_NORM_TAGS,
    **BRETON_MUTATION_TAGS,
    **JA_TAGS,
    **OTHER_GRAMMATICAL_TAGS,
    **SENSE_TAGS,
    **VOICE_TAGS,
    **LEXIQUE_TAGS,
    **ZH_PRON_TAGS,
    **ASPECT_TAGS,
}
def translate_raw_tags(data: WordEntry) -> WordEntry:
    """Move recognized entries of ``data.raw_tags`` into tags and topics.

    Each raw tag is lowercased and looked up, in this order, in
    ``GRAMMATICAL_TAGS``, ``TOPIC_TAGS`` and ``SLANG_TOPICS``:

    * a grammatical match appends the translated tag(s) to ``data.tags``,
      skipping any tag already present;
    * a topic match appends the mapped value to ``data.topics``;
    * a slang-topic match does the same and also adds the ``"slang"`` tag
      if it is missing.

    The two topic tables are consulted only when ``data`` has a ``topics``
    attribute. Raw tags that match nothing are kept, with their original
    casing, in ``data.raw_tags``.

    ``data`` is modified in place and also returned.
    """
    # Imported inside the function, as in the original code (presumably to
    # avoid a circular import; verify before moving to module level).
    from .topics import SLANG_TOPICS, TOPIC_TAGS

    unrecognized = []
    for original_tag in data.raw_tags:
        key = original_tag.lower()

        if key in GRAMMATICAL_TAGS:
            translated = GRAMMATICAL_TAGS[key]
            # Normalize the str / list[str] value to one list of candidates.
            if isinstance(translated, str):
                candidates = [translated]
            elif isinstance(translated, list):
                candidates = translated
            else:
                candidates = []
            for tag in candidates:
                if tag not in data.tags:
                    data.tags.append(tag)
            continue

        # Models without a "topics" field can only keep the raw tag.
        if not hasattr(data, "topics"):
            unrecognized.append(original_tag)
            continue

        if key in TOPIC_TAGS:
            data.topics.append(TOPIC_TAGS[key])
        elif key in SLANG_TOPICS:
            data.topics.append(SLANG_TOPICS[key])
            if "slang" not in data.tags:
                data.tags.append("slang")
        else:
            unrecognized.append(original_tag)

    data.raw_tags = unrecognized
    return data