Coverage for src/wiktextract/extractor/pt/translation.py: 85%
124 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Translation, WordEntry
def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    title_text: str,
) -> None:
    """Walk a section node and extract every translation list item in it.

    The section title selects which ``word_entry`` list receives the
    results: "Cognatos" -> ``cognates``, "Descendentes" -> ``descendants``,
    any other title -> ``translations``.

    Only direct template and list children of ``level_node`` are visited.
    A "tradini" template updates the sense text and sense index that are
    passed along with every list item that follows it; other templates
    are ignored here.
    """
    field_by_title = {"Cognatos": "cognates", "Descendentes": "descendants"}
    target_field = field_by_title.get(title_text, "translations")

    # Sense information carried over from the most recent "tradini".
    current_sense = ""
    current_sense_index = 0

    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LIST
    for child in level_node.find_child(wanted_kinds):
        if child.kind == NodeKind.TEMPLATE:
            if child.template_name == "tradini":
                current_sense, current_sense_index = extract_tradini_template(
                    wxr, child
                )
        elif child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    item,
                    current_sense,
                    current_sense_index,
                    target_field,
                )
def extract_tradini_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[str, int]:
    """Return ``(sense, sense_index)`` read from a "tradini" template.

    https://pt.wiktionary.org/wiki/Predefinição:tradini

    The first positional argument is cleaned to plain text. When it
    starts with ``De <number>``, the number is returned as the sense
    index and the rest of the text, with surrounding parentheses and
    spaces stripped, as the sense. Otherwise the whole text is the sense
    and the index is ``0``.

    Returns:
        A ``(str, int)`` pair; the index is an ``int``, never a string.
    """
    first_arg_str = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    m = re.match(r"De (\d+)", first_arg_str)
    if m is None:
        # No leading "De <n>" marker: keep the text as-is, index stays 0.
        return first_arg_str, 0
    sense_index = int(m.group(1))
    sense = first_arg_str[m.end() :].strip("() ")
    return sense, sense_index
def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
    target_field: str,
) -> None:
    """Extract the entries of one list item and append them to a field.

    The children of ``list_item`` are scanned in order:

    * link nodes: a link whose text contains "/traduções" or "/tradução"
      is handed to ``extract_translation_subpage``; otherwise the first
      such link text is remembered as the language name.
    * "trad", "trad-", "t" and "xlatio" templates: parsed by their
      dedicated extractors.
    * plain strings containing a parenthesised part: the stripped string
      is assigned as ``roman`` to every entry collected so far.
    * italic nodes: their text is added as a raw tag of the latest entry.
    * nested lists: each nested item is processed recursively.

    Everything collected is appended to ``getattr(word_entry,
    target_field)`` once the scan is finished.
    """
    # Extractors sharing the (wxr, node, sense, sense_index) signature.
    simple_extractors = {
        "trad": extract_trad_template,
        "trad-": extract_trad_minus_template,
        "t": extract_t_template,
    }
    found = []
    lang_name = "unknown"
    for child in list_item.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            link_text = clean_node(wxr, None, child)
            if "/traduções" in link_text or "/tradução" in link_text:
                extract_translation_subpage(wxr, word_entry, link_text)
            elif lang_name == "unknown":
                lang_name = link_text
        elif isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name in simple_extractors:
                extractor = simple_extractors[template_name]
                found.extend(extractor(wxr, child, sense, sense_index))
            elif template_name == "xlatio":
                # "xlatio" gets its language name from the caller: the
                # previous entry's language, else the list item's link.
                fallback_lang = found[-1].lang if found else lang_name
                found.extend(
                    extract_xlatio_template(
                        wxr, child, sense, sense_index, fallback_lang
                    )
                )
        elif (
            isinstance(child, str)
            and re.search(r"\(.+\)", child) is not None
        ):
            romanization = child.strip("() \n")
            for entry in found:
                entry.roman = romanization
        elif (
            isinstance(child, WikiNode)
            and child.kind == NodeKind.ITALIC
            and found
        ):
            raw_tag = clean_node(wxr, None, child)
            if raw_tag != "":
                found[-1].raw_tags.append(raw_tag)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    nested_item,
                    sense,
                    sense_index,
                    target_field,
                )

    getattr(word_entry, target_field).extend(found)
def extract_trad_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Build one ``Translation`` per word argument of a "trad" template.

    https://pt.wiktionary.org/wiki/Predefinição:trad

    Argument 1 is the language code and the named argument "t" is used
    as the romanization of every entry. Positional arguments 2 to 11
    are read as translated words, stopping at the first missing one;
    arguments that clean to an empty string are skipped. The language
    name is the text of the first link in the expanded template, or
    "unknown" when the expansion has no link.
    """
    params = t_node.template_parameters
    roman = clean_node(wxr, None, params.get("t", ""))
    lang_code = clean_node(wxr, None, params.get(1, ""))

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    lang_name = (
        "unknown" if first_link is None else clean_node(wxr, None, first_link)
    )

    results = []
    for position in range(2, 12):
        if position not in params:
            break
        word = clean_node(wxr, None, params[position])
        if word == "":
            continue
        results.append(
            Translation(
                word=word,
                lang=lang_name,
                lang_code=lang_code,
                roman=roman,
                sense=sense,
                sense_index=sense_index,
            )
        )
    return results
def extract_trad_minus_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Build at most one ``Translation`` from a "trad-" template.

    https://pt.wiktionary.org/wiki/Predefinição:trad-

    Argument 1 is the language code, argument 2 the translated word and
    argument 3 the romanization (surrounding parentheses and spaces are
    stripped). The language name is the text of the first link in the
    expanded template, or "unknown" when there is none. An empty list
    is returned when the word is empty.
    """
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, ""))

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    lang_name = (
        "unknown" if first_link is None else clean_node(wxr, None, first_link)
    )

    word = clean_node(wxr, None, params.get(2, ""))
    roman = clean_node(wxr, None, params.get(3, "")).strip("() ")
    entry = Translation(
        word=word,
        lang=lang_name,
        lang_code=lang_code,
        roman=roman,
        sense=sense,
        sense_index=sense_index,
    )
    return [entry] if entry.word != "" else []
# Maps the one-letter gender argument of translation templates to the
# tag stored on the extracted entry.
TRANSLATION_GENDER_TAGS: dict[str, str] = {
    "c": "common",
    "f": "feminine",
    "m": "masculine",
    "n": "neuter",
}
def extract_t_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Build at most one ``Translation`` from a "t" template.

    https://pt.wiktionary.org/wiki/Predefinição:t

    Argument 1 is the language code, argument 2 the translated word,
    argument 3 a gender letter looked up in ``TRANSLATION_GENDER_TAGS``
    and argument 4 the romanization (surrounding parentheses and spaces
    are stripped). The language name is the text of the first link in
    the expanded template, or "unknown" when there is none. An empty
    list is returned when the word is empty.
    """
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, ""))

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    lang_name = (
        "unknown" if first_link is None else clean_node(wxr, None, first_link)
    )

    word = clean_node(wxr, None, params.get(2, ""))
    roman = clean_node(wxr, None, params.get(4, "")).strip("() ")
    entry = Translation(
        word=word,
        lang=lang_name,
        lang_code=lang_code,
        roman=roman,
        sense=sense,
        sense_index=sense_index,
    )

    gender_tag = TRANSLATION_GENDER_TAGS.get(
        clean_node(wxr, None, params.get(3, ""))
    )
    if gender_tag is not None:
        entry.tags.append(gender_tag)

    return [entry] if entry.word != "" else []
def extract_xlatio_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    sense_index: int,
    lang_name: str,
) -> list[Translation]:
    """Build at most one ``Translation`` from an "xlatio" template.

    https://pt.wiktionary.org/wiki/Predefinição:xlatio

    Argument 1 is the language code and argument 2 the translated word;
    the language name is supplied by the caller. Argument 3 is treated
    as a gender tag when, after removing leading and trailing dots, it
    is a key of ``TRANSLATION_GENDER_TAGS``; otherwise it is stored as
    the romanization with surrounding parentheses and spaces stripped.
    An empty list is returned when the word is empty.
    """
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, ""))
    entry = Translation(
        word=clean_node(wxr, None, params.get(2, "")),
        lang=lang_name,
        lang_code=lang_code,
        sense=sense,
        sense_index=sense_index,
    )

    third_arg = clean_node(wxr, None, params.get(3, ""))
    gender_key = third_arg.strip(".")
    if gender_key in TRANSLATION_GENDER_TAGS:
        entry.tags.append(TRANSLATION_GENDER_TAGS[gender_key])
    else:
        entry.roman = third_arg.strip("() ")

    return [entry] if entry.word != "" else []
def extract_translation_subpage(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract translations from another page into ``word_entry``.

    The page named ``page_title`` is fetched with namespace id 0. If the
    page or its body is missing, nothing happens. Otherwise the body is
    parsed and its root node is processed by
    ``extract_translation_section`` under the title "Tradução".
    """
    page = wxr.wtp.get_page(page_title, 0)
    if page is None or page.body is None:
        return
    subpage_root = wxr.wtp.parse(page.body)
    extract_translation_section(wxr, word_entry, subpage_root, "Tradução")