Coverage for src/wiktextract/extractor/fr/tags.py: 87%
42 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
# Grammatical glossary appendix:
# https://fr.wiktionary.org/wiki/Annexe:Glossaire_grammatical
# List of templates:
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles
5from .models import WordEntry
7# https://en.wikipedia.org/wiki/Grammatical_gender
8GENDER_TAGS: dict[str, str | list[str]] = {
9 "commun": "common",
10 "féminin": "feminine",
11 "masculin": "masculine",
12 "neutre": "neuter",
13 # https://fr.wiktionary.org/wiki/Modèle:mf
14 "masculin et féminin identiques": ["masculine", "feminine"],
15 # table header: https://fr.wiktionary.org/wiki/Modèle:fr-rég
16 "masculin et féminin": ["masculine", "feminine"],
17 # "Modèle:mf ?", "Modèle:fm ?"
18 "masculin ou féminin (l’usage hésite)": ["masculine", "feminine"],
19 "féminin ou masculin (l’usage hésite)": ["feminine", "masculine"],
20 "invariable": "invariable", # Modèle:invar
21 # Modèle:flex-ku-nommixt
22 "masculin sing.": ["masculine", "singular"],
23 "féminin sing.": ["feminine", "singular"],
24 # Template:ja-flx-adj-な
25 "neutre négatif": ["neuter", "negative"],
26 "neutre passé": ["neuter", "past"],
27 "neutre négatif passé": ["neuter", "negative", "past"],
28 "poli négatif": ["polite", "negative"],
29 "poli passé": ["polite", "past"],
30 "poli négatif passé": ["polite", "negative", "past"],
31 # Template:m
32 "masculin animé": ["masculine", "animate"],
33 "masculin inanimé": ["masculine", "inanimate"],
34 # Template:f
35 "féminin animé": ["feminine", "animate"],
36 "féminin inanimé": ["feminine", "inanimate"],
37 # Template:n
38 "neutre animé": ["neuter", "animate"],
39 "neutre inanimé": ["neuter", "inanimate"],
40}
42# https://en.wikipedia.org/wiki/Grammatical_number
43NUMBER_TAGS: dict[str, str | list[str]] = {
44 "singulier": "singular",
45 "pluriel": "plural",
46 "duel": "dual",
47 "collectif": "collective",
48 "singulatif": "singulative",
49 "indénombrable": "uncountable", # sv-nom-c-ind
50 "au singulier": "singular",
51 "au singulier uniquement": "singular-only",
52 "au pluriel": "plural",
53 "au pluriel uniquement": "plural-only",
54 "singulier et pluriel identiques": ["singular", "plural"],
55 "nom collectif": "collective",
56 # "générique": "", # Modèle:g
57 # "nom d'unité": "", # Modèle:nu
58 "généralement indénombrable": "uncountable",
59 "dénombrable": "countable",
60 # Modèle:br-nom
61 "pluriel 1": "plural",
62 "pluriel 2": "plural",
63 "pluriel 3": "plural",
64 "pluriel 4": "plural",
65 # https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug
66 "1": "first-person",
67 "2": "second-person",
68 "3": "third-person",
69 "4": "fourth-person",
70 # Template:nl-conj-cons
71 # https://en.wikipedia.org/wiki/Dutch_grammar#Personal_pronouns
72 "ik": ["first-person", "singular"],
73 "jij": ["second-person", "singular", "informal"],
74 "hij, zij, het": "third-person",
75 "wij": ["first-person", "plural"],
76 "jullie": ["second-person", "plural", "informal"],
77 "zij": ["third-person", "plural"],
78 "u": "second-person",
79}
81# https://en.wikipedia.org/wiki/Grammatical_mood
82MOOD_TAGS: dict[str, str] = {
83 "indicatif": "indicative",
84 "subjonctif": "subjunctive",
85 "conditionnel": "conditional",
86 "impératif": "imperative",
87 "volitif": "volitive",
88 "déclaratif": "declarative",
89 "interrogatif": "interrogative",
90 "aperceptif": "apperceptive",
91 "euphémique": "euphemistic",
92 "évidentiel": "evidential",
93 "spéculatif": "speculative",
94 "assertif": "assertive",
95 "hortatif": "hortative",
96 "promissif": "promissive",
97 "conditionnel / subjonctif": ["conditional", "subjunctive"],
98 "conjonctif": "subjunctive",
99 "provisionnel": "temporal",
100 # Template:de-conj
101 "subjonctif i": "subjunctive-i",
102 "subjonctif ii": "subjunctive-ii",
103}
105VERB_FORM_TAGS: dict[str, str | list[str]] = {
106 "participe": "participle",
107 "imparfait": "imperfect",
108 # Template:ku-conj-trans
109 "parfait": "perfect",
110 "imparfait narratif": ["imperfect", "narrative"],
111 "infinitif": "infinitive",
112 "gérondif": "gerund",
113 # template "pt-verbe-flexion"
114 "infinitif personnel": ["infinitive", "personal"],
115 "supin": "supine",
116 # Template:ko-conj
117 "conjugaison": "conjugation",
118 "radical": "radical",
119 "formes finales": "final",
120 "registre formel": "formal",
121 "registre informel": "informal",
122 "non poli": "impolite",
123 "poli": "polite",
124 "formes nominales": "nominal",
125 "formes conjonctives": "subjunctive",
126 # Template:ja-在る
127 "formes de base": "base-form",
128 "affirmatif": "affirmative",
129 "négatif": "negative",
130 "adverbial": "adverbial",
131 # Template:bg-verbe186
132 "aoriste": "aorist",
133 "participe passé passif": ["participle", "past", "passive"],
134 "participe passé actif": ["participle", "past", "active"],
135 "participe imparfait": ["participle", "imperfect"],
136 "auxiliaire": "auxiliary",
137 "bitransitif": "ditransitive",
138 "déterminé": "determinate",
139 "indéterminé": "indeterminate",
140}
142# https://en.wikipedia.org/wiki/Grammatical_case
143CASE_TAGS: dict[str, str | list[str]] = {
144 "ablatif": "ablative",
145 "accusatif": "accusative",
146 "accusatif génitif": ["accusative", "genitive"],
147 "nominatif": "nominative",
148 "datif": "dative",
149 "génitif": "genitive",
150 "vocatif": "vocative",
151 "instrumental": "instrumental",
152 "locatif": "locative",
153 "comitatif": "comitative",
154 "essif": "essive",
155 "illatif": "illative",
156 # Template:ro-nom-tab
157 "nominatif\naccusatif": ["nominative", "accusative"],
158 "datif\ngénitif": ["dative", "genitive"],
159 # Template:ko-nom
160 "nominatif / attributif": ["nominative", "attributive"],
161 # Modèle:fro-adj
162 "sujet": "subject",
163 "régime": "oblique",
164}
166# https://en.wikipedia.org/wiki/Grammatical_tense
167TENSE_TAGS: dict[str, str | list[str]] = {
168 "présent": "present",
169 "passé": "past",
170 "passé simple": "past",
171 "futur": "future",
172 "futur simple": "future",
173 # https://en.wikipedia.org/wiki/Passé_composé
174 "passé composé": ["past", "multiword-construction"],
175 "plus-que-parfait": "pluperfect",
176 "passé antérieur": ["past", "anterior"],
177 "futur antérieur": ["future", "perfect"],
178 "prétérit": "preterite",
179 "présent simple, 3ᵉ pers. sing.": ["present", "third-person", "singular"],
180 "participe passé": ["participle", "past"],
181 "participe présent": ["participle", "present"],
182 # Template:ku-conj-trans
183 "présent progressif": ["present", "progressive"],
184 "prétérit et imparfait": ["preterite", "imperfect"],
185 "non passé": "non-past",
186 "présent / futur": ["present", "future"],
187 # Template:de-conj
188 "futur i": "future-i",
189 "futur ii": "future-ii",
190}
192# https://en.wikipedia.org/wiki/Grammatical_person
193PERSON_TAGS: dict[str, str | list[str]] = {
194 "1ᵉ personne": "first-person",
195 "1ʳᵉ personne": "first-person",
196 "2ᵉ personne": "second-person",
197 "3ᵉ personne": "third-person",
198 # Modèle:avk-conj
199 "1ʳᵉ du sing.": ["first-person", "singular"],
200 "2ᵉ du sing.": ["second-person", "singular"],
201 "3ᵉ du sing.": ["third-person", "singular"],
202 "1ʳᵉ du plur.": ["first-person", "plural"],
203 "2ᵉ du plur.": ["second-person", "plural"],
204 "3ᵉ du plur.": ["third-person", "plural"],
205 "4ᵉ du plur.": ["fourth-person", "plural"],
206}
# Definiteness labels: lower-cased French raw tag -> English tag.
SEMANTICS_TAGS: dict[str, str] = {
    # https://en.wikipedia.org/wiki/Definiteness
    "défini": "definite",
    "indéfini": "indefinite",
    # template:ro-nom-tab
    "articulé": "definite",
    "non articulé": "indefinite",
}
217COMPARISON_TAGS: dict[str, str] = {
218 # https://en.wikipedia.org/wiki/Comparison_(grammar)
219 "positif": "positive",
220 "comparatif": "comparative",
221 "superlatif": "superlative",
222 "non comparable": "not-comparable",
223 "superlatif absolu": ["superlative", "absolute"],
224}
# Occitan spelling-norm labels: lower-cased French raw tag -> English tag.
# The commented-out entries have no English tag assigned yet.
# https://en.wikipedia.org/wiki/Occitan_language#Writing_system
OCCITAN_NORM_TAGS: dict[str, str] = {
    # https://fr.wiktionary.org/wiki/Modèle:oc-norme_mistralienne
    "graphie mistralienne": "Mistralian",
    # https://fr.wiktionary.org/wiki/Modèle:oc-norme_classique
    # "graphie normalisée": "",
    # Modèle:oc-norme bonnaudienne
    # "graphie bonnaudienne": "",
}
# Breton consonant-mutation labels: lower-cased French raw tag -> English tag.
# https://en.wikipedia.org/wiki/Breton_mutations
# https://fr.wiktionary.org/wiki/Modèle:br-nom
BRETON_MUTATION_TAGS: dict[str, str] = {
    "non muté": "unmutated",
    "adoucissante": "mutation-soft",
    "durcissante": "mutation-hard",
    "spirante": "mutation-spirant",
    "nasale": "mutation-nasal",
}
# Japanese script / transcription labels: lower-cased raw tag -> English tag.
JA_TAGS: dict[str, str] = {
    # https://fr.wiktionary.org/wiki/Modèle:ja-trans
    "kanji": "kanji",
    "hiragana": "hiragana",
    "katakana": "katakana",
    "transcription": "transcription",
}
254OTHER_GRAMMATICAL_TAGS: dict[str, str] = {
255 # https://fr.wiktionary.org/wiki/Modèle:be-tab-cas
256 "prépositionnel": "prepositional",
257 "anglicisme": "Anglicism",
258 "pronominal": "pronominal",
259 "diminutif": "diminutive",
260 "réfléchi": "reflexive", # Modèle:réfl
261 "réciproque": "reciprocal", # Modèle:réciproque
262 "impersonnel": "impersonal", # Modèle:impers
263 "transitif": "transitive", # Modèle:t
264 "transitif indirect": ["transitive", "indirect"], # Modèle:transitif indir
265 "intransitif": "intransitive", # Modèle:i
266 "injurieux": "offensive", # Modèle:injurieux
267 # Modèle:zh-formes
268 "simplifié": "Simplified-Chinese",
269 "traditionnel": "Traditional-Chinese",
270 # Modèle:flex-ku-nomf
271 "ézafé principal": ["ezafe", "primary"],
272 "ézafé secondaire": ["ezafe", "secondary"],
273 "cas oblique": "oblique",
274 # Modèle:ku-conj-trans
275 "forme affirmative": "affirmative",
276 "forme négative": "negative",
277 # Modèle:bg-nom
278 "forme de base": "base-form",
279 "pluriel numéral": ["plural", "numeral"],
280 "animé": "animate",
281 "inanimé": "inanimate",
282 # Template:ko-nom
283 "hangeul": "hangeul",
284 "hanja": "hanja",
285 "avec clitique": "clitic",
286 "indéclinable": "indeclinable",
287 "toponyme": "toponymic",
288 "applicatif": "applicative",
289 "causatif": "causative",
290 "sigle": "abbreviation",
291 "attributif": "attributive",
292 "prédicatif": "predicative",
293 # Template:cy-mut
294 "non muté": "unmutated",
295 "lénition": "lenition",
296 "nasalisation": "nasalization",
297 "syllabaire": "Syllabics",
298 "par ellipse": "ellipsis", # Template:ellipse
299 "ironique": "ironic",
300 "suffixe": "suffix",
301 # Template:avk-tab-conjug
302 "conjugaison présent indicatif": ["present", "indicative"],
303 # Modèle:de-adjectif-déclinaisons
304 "déclinaison forte": "strong",
305 "déclinaison faible": "weak",
306 "déclinaison mixte": "mixed",
307 "singulier / pluriel": ["singular", "plural"],
308}
310# template text before gloss
311SENSE_TAGS: dict[str, str] = {
312 # https://fr.wiktionary.org/wiki/Modèle:figuré
313 # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_relation_entre_les_définitions
314 # Catégorie:Modèles de genre textuel
315 # Catégorie:Modèles de registre
316 "sens figuré": "figuratively",
317 "sens propre": "literally",
318 "par métonymie": "metonymically", # Modèle:par métonymie
319 "par hyperbole": "hyperbole",
320 "par extension": "broadly",
321 "par analogie": "analogy",
322 "en particulier": "especially",
323 "par litote": "litotes",
324 "par euphémisme": "euphemism",
325 "spécifiquement": "specifically",
326 "génériquement": "generically",
327 "spécialement": "especially",
328 "généralement": "generally",
329 "enclise": "enclitic",
330 "idiotisme": "idiomatic",
331 "péjoratif": "pejorative",
332 "désuet": "obsolete",
333 "archaïsme": "archaic",
334 "vieilli": "dated",
335 "néologisme": "neologism",
336 "argot": "slang",
337 "rare": "rare",
338 # "plus rare": "rare",
339 "littéraire": "literary", # Modèle:littéraire
340 "poétique": "poetic", # Modèle:poétique
341 # "didactique": "", # Modèle:didactique
342 "soutenu": "formal", # Modèle:soutenu
343 "informel": "informal", # Modèle:informel
344 "familier": "familiar", # Modèle:familier
345 "très familier": "very-familiar", # Modèle:très familier
346 "populaire": "colloquial", # Modèle:populaire
347 "vulgaire": "vulgar", # Modèle:vulgaire
348 "langage enfantin": "childish", # Modèle:enfantin
349 # Catégorie:Modèles de thématique
350 "anglicisme informatique": "Anglicism",
351 "proverbe": "proverb",
352 "collectivement": "collectively",
353 "courant": "common", # Modèle:courant
354 "adjectif attribut": ["adjective", "attributive"],
355}
357# https://en.wikipedia.org/wiki/Voice_(grammar)
358VOICE_TAGS: dict[str, str | list[str]] = {
359 # https://fr.wiktionary.org/wiki/Modèle:eo-conj
360 "participe actif": ["participle", "active"],
361 "participe passif": ["participle", "passive"],
362 "adverbe actif": ["adverb", "active"],
363 "adverbe passif": ["adverb", "passive"],
364 "substantif\nactif": ["subsuntive", "active"],
365 "substantif\npassif": ["subsuntive", "passive"],
366 "actif": "active",
367 "passif": "passive",
368 "adverbe": "adverb",
369}
# Lexicon labels taken from Module:lexique/data: lower-cased French raw
# tag -> English tag.
LEXIQUE_TAGS: dict[str, str] = {
    "hindouisme": "Hinduism",
    "judaïsme": "Judaism",
    "marxisme": "Marxism",
    "nazisme": "Nazism",
    "physique": "physical",
    "rhétorique": "rhetoric",
    "antiquité": "Ancient",
    "antiquité grecque": "Ancient-Greek",
    "antiquité romaine": "Ancient-Roman",
    "bible": "Biblical",
    "moyen âge": "Middle-Ages",
    "union européenne": "European-Union",
    "analyse": "analytic",
}
388# Template:cmn-pron
389# https://fr.wiktionary.org/wiki/自由
390ZH_PRON_TAGS = {
391 "pinyin": "Pinyin",
392 "efeo": "EFEO", # https://en.wikipedia.org/wiki/EFEO_Chinese_transcription
393 "wade-giles": "Wade-Giles",
394 "yale": "Yale",
395 "zhuyin": "Bopomofo",
396 "mandarin": "Mandarin",
397 "cantonais": "Cantonese",
398 "cantonais (yue)": "Cantonese",
399 "jyutping": "Jyutping",
400 "hakka": "Hakka",
401 "pha̍k-fa-sṳ": "Phak-fa-su",
402 "meixian, guangdong": ["Meixian", "Guangdong"],
403 "jin": "Jin",
404 "mindong": "Eastern-Min",
405 # https://en.wikipedia.org/wiki/Bàng-uâ-cê
406 "bàng-uâ-cê (fuzhou)": ["Bang-ua-ce", "Fuzhou"],
407 "minnan": "Min",
408 "pe̍h-ōe-jī (hokkien : fujian, taïwan)": [
409 "Peh-oe-ji",
410 "Hokkien",
411 "Fujian",
412 "Taiwan",
413 ],
414 "chaozhou, peng'im": ["Chaozhou", "Peng'im"],
415 "wu": "Wu",
416 "shanghai": "Shanghai",
417 "chinois médiéval": "Medieval-Chinese",
418 "chinois archaïque": "Old-Chinese",
419 "baxter-sagart": "Baxter-Sagart",
420 "zhengzhang": "Zhengzhang",
421}
# Aspect labels: lower-cased French raw tag -> English tag.
ASPECT_TAGS: dict[str, str] = {
    "perfectif": "perfective",  # Modèle:perfectif
    "imperfectif": "imperfective",  # Modèle:imperfectif
}
# Single lookup table used by translate_raw_tags(): the union of every
# per-category table above. With dict unpacking, a key that occurs in more
# than one table takes the value from the table listed last.
GRAMMATICAL_TAGS: dict[str, str | list[str]] = {
    **GENDER_TAGS,
    **NUMBER_TAGS,
    **MOOD_TAGS,
    **VERB_FORM_TAGS,
    **CASE_TAGS,
    **TENSE_TAGS,
    **PERSON_TAGS,
    **SEMANTICS_TAGS,
    **COMPARISON_TAGS,
    **OCCITAN_NORM_TAGS,
    **BRETON_MUTATION_TAGS,
    **JA_TAGS,
    **OTHER_GRAMMATICAL_TAGS,
    **SENSE_TAGS,
    **VOICE_TAGS,
    **LEXIQUE_TAGS,
    **ZH_PRON_TAGS,
    **ASPECT_TAGS,
}
def translate_raw_tags(data: WordEntry) -> WordEntry:
    """Translate the French raw tags of ``data`` into English tags/topics.

    Each entry of ``data.raw_tags`` is lower-cased and looked up, in this
    order, in ``GRAMMATICAL_TAGS``, ``TOPIC_TAGS`` and ``SLANG_TOPICS``:

    * a grammatical match adds its tag(s) to ``data.tags``;
    * a topic match appends the mapped value to ``data.topics``;
    * a slang-topic match appends the mapped value to ``data.topics`` and
      also adds the ``"slang"`` tag.

    Topic lookups are only attempted when ``data`` has a ``topics``
    attribute. Tags already present in ``data.tags`` are not added a second
    time. Raw tags that match nothing are kept, with their original casing,
    in ``data.raw_tags``.

    Args:
        data: Object exposing ``raw_tags`` and ``tags`` lists and,
            optionally, a ``topics`` list. It is modified in place.

    Returns:
        The same ``data`` object.
    """
    # Function-local import, as in the original code.
    from .topics import SLANG_TOPICS, TOPIC_TAGS

    has_topics = hasattr(data, "topics")
    untranslated = []
    for raw_tag in data.raw_tags:
        raw_tag_lower = raw_tag.lower()
        if raw_tag_lower in GRAMMATICAL_TAGS:
            tr_tag = GRAMMATICAL_TAGS[raw_tag_lower]
            # Table values are either one tag or a list of tags; handle both
            # through the same de-duplicating path.
            tr_tags = [tr_tag] if isinstance(tr_tag, str) else tr_tag
            for tag in tr_tags:
                if tag not in data.tags:
                    data.tags.append(tag)
        elif has_topics and raw_tag_lower in TOPIC_TAGS:
            data.topics.append(TOPIC_TAGS[raw_tag_lower])
        elif has_topics and raw_tag_lower in SLANG_TOPICS:
            data.topics.append(SLANG_TOPICS[raw_tag_lower])
            if "slang" not in data.tags:
                data.tags.append("slang")
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated
    return data