Coverage for src/wiktextract/extractor/tr/pos.py: 89%
162 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
3from wikitextprocessor import (
4 HTMLNode,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .example import extract_example_list_item
14from .models import AltForm, Example, Form, Sense, WordEntry
15from .section_titles import POS_DATA
16from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Start a new word entry for a part-of-speech section and fill it.

    A deep copy of ``base_data`` is appended to ``page_data`` and receives
    the ``pos`` value and optional ``tags`` that ``POS_DATA`` stores for
    ``pos_title``.  Gloss list items among the direct children of
    ``level_node`` go to ``extract_gloss_list_item``; every child located
    before the first list that yielded a gloss item goes to
    ``extract_pos_header_nodes``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    children = level_node.children
    # Position of the first list that produced a gloss item; stays at
    # len(children) when there is none, so all children count as header.
    first_gloss_pos = len(children)
    for position, child in enumerate(children):
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            if child.sarg == "#":
                is_gloss_item = True
            elif child.sarg == ":":
                # A ":" item only counts as a gloss when its first child
                # is text containing a bracketed number such as "[1]".
                is_gloss_item = (
                    len(item.children) > 0
                    and isinstance(item.children[0], str)
                    and re.search(r"\[\d+\]", item.children[0]) is not None
                )
            else:
                is_gloss_item = False
            if is_gloss_item:
                extract_gloss_list_item(wxr, entry, item)
                first_gloss_pos = min(first_gloss_pos, position)

    extract_pos_header_nodes(wxr, entry, children[:first_gloss_pos])
    translate_raw_tags(entry)
# Template names that extract_gloss_list_item passes on to
# extract_form_of_template instead of keeping them as gloss text.
# https://tr.wiktionary.org/wiki/Kategori:Çekim_şablonları
# https://tr.wiktionary.org/wiki/Kategori:Tanım_şablonları
FORM_OF_TEMPLATES = {
    # names with a "tr-", "ota-" or "ar-" prefix
    "tr-çekim",
    "tr-çekim:m1",
    "tr-ünlü-çekimi",
    "ota-çekim",
    "ar-mastarı",
    # names with an "el-" prefix
    "el-ortaç çekimi",
    "el-çekim:ος-η-ο",
    "el-çekim:βιώνω",
    "el-çekim:ος-α-ο",
    "el-çekim:θεωρώ",
    "el-çekim:ορίζω",
    # names that also appear in ALT_OF_TEMPLATES
    "farklı",
    "alternatif",
    "yanlış yazım",
    "doğrusu",
    "Doğrusu",
    "imla hatası",
    # names that also appear in the *_FORM_OF_TEMPLATE_TAGS tables
    "kısaltma",
    "akronim",
    "kısa",
    "mastarı",
    "romanizasyon",
    # remaining names
    "çekim",
    "karşılaştırma",
    "Komp.",
    "artıklık",
    "üstünlük",
    "Sup.",
    "ad-hâl",
    "hâl",
    "çoğul ad",
    "çoğulu",
    "çoğul isim",
    "ikil ad",
    "ikil",
    "çoğul kısaltma",
    "eylem-hâl",
    "fiil",
    "eylem",
    "dişil tekili",
    "dişil çoğulu",
    "eril çoğulu",
    "ön ad",
    "sıfat",
}
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one definition list item into a sense of ``word_entry``.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  "t"/"terim" templates and templates
    named in ``FORM_OF_TEMPLATES`` are handled by their own extractors;
    all other non-list children make up the gloss text.  Nested lists are
    then processed as sub-senses or as examples, depending on their marker.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in ["t", "terim"]:
                extract_terim_template(wxr, sense, child)
                continue
            if name in FORM_OF_TEMPLATES:
                # NOTE(review): extract_form_of_template may itself append
                # `sense` to word_entry.senses — confirm that a second
                # append below (when extra gloss text exists) is intended.
                extract_form_of_template(wxr, word_entry, sense, child)
                continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Nested lists are handled after the gloss text.
            continue
        gloss_parts.append(child)

    # Drop a leading "[n]" sense number from the cleaned gloss text.
    gloss_text = re.sub(
        r"^\[\d+\]\s*", "", clean_node(wxr, sense, gloss_parts)
    )
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if marker.startswith("#") and marker.endswith("#"):
            # Marker made of "#" at both ends: nested definitions.
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.startswith(("#", ":")) and marker.endswith((":", "*")):
            # Marker ending in ":" or "*": example lines for this sense.
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                example = Example(text="")
                extract_example_list_item(
                    wxr, word_entry, sub_item, example
                )
                if example.text != "":
                    sense.examples.append(example)
def extract_terim_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the comma-separated labels of a "terim" template to ``sense``.

    The template is cleaned with ``clean_node``, surrounding parentheses
    and spaces are stripped, and each non-empty piece other than a lone
    apostrophe is appended to ``sense.raw_tags``.
    """
    # https://tr.wiktionary.org/wiki/Şablon:terim
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    labels = (piece.strip() for piece in label_text.split(","))
    sense.raw_tags.extend(
        label for label in labels if label not in ["", "'"]
    )
def extract_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Dispatch the templates found in the nodes preceding the gloss list.

    Templates named "başlık başı", or whose name begins with the entry's
    language code followed by "-", go to ``extract_pos_header_template``.
    The "sahiplik", "sahiplik eki" and "özel çoğul" templates go to
    ``extract_sahiplik_template``.  Everything else is ignored.
    """
    for node in nodes:
        if not isinstance(node, TemplateNode):
            continue
        name = node.template_name
        if (
            name.startswith((word_entry.lang_code + "-"))
            or name == "başlık başı"
        ):
            extract_pos_header_template(wxr, word_entry, node)
        elif name in ["sahiplik", "sahiplik eki", "özel çoğul"]:
            extract_sahiplik_template(wxr, word_entry, node)
def extract_pos_header_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms and gender tags from an expanded headword template.

    The template is expanded and its top-level nodes are scanned in order:

    * italic nodes supply raw tags for the next bold form; the italic
      text "veya" is not a tag but marks the next form as an alternative
      to the previous one;
    * ``<b>`` nodes become ``Form`` objects carrying the pending raw tags;
    * ``<span>`` nodes whose class contains "gender" contribute the text
      of their ``<abbr>`` children to ``word_entry.raw_tags``;
    * ``<strong>`` nodes whose class contains "headword" become a
      "canonical" form when their text differs from the page title;
    * ``<span>`` nodes whose class contains "headword-tr" become a
      "transliteration" form.

    Finally the whole expansion is passed to ``clean_node`` together with
    ``word_entry``.
    """
    # Şablon:başlık_başı, Şablon:tr-ad
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    raw_tags = []
    last_italic_is_or = False
    for node in expanded_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            raw_tag = clean_node(wxr, None, node)
            if raw_tag not in ["", "veya"]:
                raw_tags.append(raw_tag)
            last_italic_is_or = raw_tag == "veya"
        elif isinstance(node, HTMLNode) and node.tag == "b":
            word = clean_node(wxr, None, node)
            if word != "":
                form = Form(form=word, raw_tags=raw_tags)
                # An alternative introduced by "veya" inherits the tags of
                # the form before it.  Guard against an empty forms list
                # so a leading "veya" cannot raise IndexError.
                if last_italic_is_or and len(word_entry.forms) > 0:
                    previous_form = word_entry.forms[-1]
                    form.raw_tags.extend(previous_form.raw_tags)
                    form.tags.extend(previous_form.tags)
                translate_raw_tags(form)
                word_entry.forms.append(form)
                raw_tags.clear()
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "gender" in node.attrs.get("class", "")
        ):
            for abbr_tag in node.find_html("abbr"):
                gender_raw_tag = clean_node(wxr, None, abbr_tag)
                if gender_raw_tag not in ["", "?"]:
                    word_entry.raw_tags.append(gender_raw_tag)
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "strong"
            and "headword" in node.attrs.get("class", "")
        ):
            form_str = clean_node(wxr, None, node)
            # Only record the headword when it differs from the page title.
            if form_str not in ["", wxr.wtp.title]:
                word_entry.forms.append(Form(form=form_str, tags=["canonical"]))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "headword-tr" in node.attrs.get("class", "")
        ):
            roman = clean_node(wxr, None, node)
            if roman != "":
                word_entry.forms.append(
                    Form(form=roman, tags=["transliteration"])
                )

    clean_node(wxr, word_entry, expanded_node)
# https://tr.wiktionary.org/wiki/Kategori:Tanım_şablonları
# Template name -> sense tag, for the templates where
# extract_form_of_template takes the target word from a bold node.
BOLD_FORM_OF_TEMPLATE_TAGS = dict(
    [
        ("akronim", "acronym"),
        ("kısaltma", "abbreviation"),
        ("kısa", "short-form"),
        ("mastarı", "noun-from-verb"),
        ("ar-mastarı", "noun-from-verb"),
    ]
)
# Template name -> sense tag, for the remaining tagged templates.
FORM_OF_TEMPLATE_TAGS = {
    "romanizasyon": "romanization",
    **dict.fromkeys(
        ("yanlış yazım", "doğrusu", "Doğrusu", "imla hatası"),
        "misspelling",
    ),
}

# Templates whose target word is stored in `alt_of` rather than `form_of`.
ALT_OF_TEMPLATES = {
    "alternatif",
    "farklı",
    "imla hatası",
    "yanlış yazım",
    "Doğrusu",
    "doğrusu",
}
def extract_form_of_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Record the word a form-of/alt-of template points to, plus its gloss.

    The target word is the first bold node for templates listed in
    ``BOLD_FORM_OF_TEMPLATE_TAGS``; for all others it is the first ``<i>``
    element, falling back to the first link.  It is stored in
    ``sense.alt_of`` for templates in ``ALT_OF_TEMPLATES`` and in
    ``sense.form_of`` otherwise.  The expanded template text becomes a
    gloss, and ``sense`` is appended to ``word_entry.senses`` whenever
    that gloss is not empty.
    """
    # https://tr.wiktionary.org/wiki/Şablon:çekim
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    template_name = t_node.template_name

    def first_cleaned(candidates) -> str:
        # Cleaned text of the first candidate node, "" when there is none.
        first = next(iter(candidates), None)
        return "" if first is None else clean_node(wxr, None, first)

    def add_gloss(gloss: str) -> None:
        # Attach a non-empty gloss and register the sense on the entry.
        if gloss != "":
            sense.glosses.append(gloss)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)

    if template_name in BOLD_FORM_OF_TEMPLATE_TAGS:
        sense.tags.append(BOLD_FORM_OF_TEMPLATE_TAGS[template_name])
        target = first_cleaned(expanded.find_child(NodeKind.BOLD))
    else:
        if template_name in FORM_OF_TEMPLATE_TAGS:
            sense.tags.append(FORM_OF_TEMPLATE_TAGS[template_name])
        target = first_cleaned(expanded.find_html_recursively("i"))
        if target == "":
            # The link fallback only applies on this non-bold path.
            target = first_cleaned(
                expanded.find_child_recursively(NodeKind.LINK)
            )

    if target != "":
        if template_name in ALT_OF_TEMPLATES:
            sense.tags.append("alt-of")
            sense.alt_of.append(AltForm(word=target))
        else:
            sense.tags.append("form-of")
            sense.form_of.append(AltForm(word=target))

    clean_node(wxr, sense, expanded)
    if not expanded.contain_node(NodeKind.LIST):
        add_gloss(clean_node(wxr, None, expanded))
        return

    # Only the first direct-child list is considered: the nodes before it
    # form the gloss and its items are extracted with `sense` as parent.
    first_list = next(
        iter(expanded.find_child(NodeKind.LIST, with_index=True)), None
    )
    if first_list is None:
        return
    list_index, list_node = first_list
    add_gloss(clean_node(wxr, None, expanded.children[:list_index]))
    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
        extract_gloss_list_item(wxr, word_entry, list_item, sense)
def extract_sahiplik_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Build one ``Form`` from a "sahiplik" / "özel çoğul" style template.

    Italic nodes of the expansion supply the form's raw tags.  The form
    text comes from a bold node: for "sahiplik" and "sahiplik eki" it is
    the first argument of the links inside the bold node, otherwise the
    bold node's own text.  The form is appended to ``word_entry.forms``
    only when its text is not empty.
    """
    # https://tr.wiktionary.org/wiki/Şablon:sahiplik, Şablon:özel_çoğul
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    takes_link_target = t_node.template_name in ["sahiplik", "sahiplik eki"]
    form = Form(form="")
    for child in expanded.children:
        if not isinstance(child, WikiNode):
            continue
        if child.kind == NodeKind.ITALIC:
            label = clean_node(wxr, None, child)
            if label != "":
                form.raw_tags.append(label)
        elif child.kind == NodeKind.BOLD:
            if not takes_link_target:
                form.form = clean_node(wxr, None, child)
                continue
            # When several links qualify, the last one wins.
            for link in child.find_child(NodeKind.LINK):
                if len(link.largs) > 0:
                    form.form = clean_node(wxr, None, link.largs[0])

    if form.form == "":
        return
    translate_raw_tags(form)
    word_entry.forms.append(form)
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Append the text of every list item in the section as a note.

    Only lists that are direct children of ``level_node`` are visited;
    items whose cleaned text is empty are skipped.
    """
    note_items = (
        item
        for note_list in level_node.find_child(NodeKind.LIST)
        for item in note_list.find_child(NodeKind.LIST_ITEM)
    )
    for item in note_items:
        text = clean_node(wxr, None, item.children)
        if text == "":
            continue
        word_entry.notes.append(text)