Coverage for src/wiktextract/extractor/it/pos.py: 77%
86 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..share import calculate_bold_offsets
6from .example import extract_example_list_item
7from .models import AltForm, Sense, WordEntry
8from .section_titles import POS_DATA
9from .tag_form_line import extract_tag_form_line_nodes
10from .tags import translate_raw_tags
# Names of templates that mark a sub-division inside a part-of-speech section.
# `extract_pos_section` tests `TemplateNode.template_name` against this set.
# Both capitalised and lower-case spellings are listed where the set contains
# both, because membership is an exact, case-sensitive string match.
POS_SUBSECTION_TEMPLATES = frozenset(
    {
        # https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi
        "-participio passato-",
        "-participio presente-",
        "Ausiliare",
        "Deponente",
        "Intransitivo",
        "Medio",
        "Passivo",
        "Reciproco",
        "Riflessivo",
        "riflessivo",
        "Transitivo",
        # https://it.wiktionary.org/wiki/Categoria:Template_vocabolo
        "Attivo",
        "attivo",
        "Inpr",
        "inpr",
        "Riflpr",
    }
)
def add_new_pos_data(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a fresh entry for ``pos_title`` to ``page_data``.

    The new entry is a deep copy of ``base_data`` with ``pos_title``, ``pos``
    and ``tags`` filled in from ``POS_DATA``.

    Args:
        wxr: Extraction context, forwarded to ``clean_node``.
        page_data: Entries collected so far; the new entry is appended.
        base_data: Template entry that is deep-copied, never mutated.
        level_node: Section node whose direct LINK children are cleaned
            against the new entry.
        pos_title: Section title as written on the page. It is stored
            verbatim on the entry.

    Raises:
        KeyError: If the (normalised) title is not a key of ``POS_DATA``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    # Every title beginning with "Trascrizione" shares a single POS_DATA key;
    # the original title has already been stored on the entry above.
    if pos_title.startswith("Trascrizione"):
        pos_title = "Trascrizione"
    pos_data = POS_DATA[pos_title]
    entry.pos = pos_data["pos"]
    entry.tags.extend(pos_data.get("tags", []))
    for link_node in level_node.find_child(NodeKind.LINK):
        # Return value is discarded: the call is made only for whatever
        # clean_node records on `entry` (presumably categories - confirm).
        clean_node(wxr, entry, link_node)
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Extract senses, tags and forms from a part-of-speech section.

    A new entry is always created first. The direct children of
    ``level_node`` are then scanned in order:

    * A list whose marker starts and ends with ``#`` is a gloss list. Each
      item is turned into a sense on the current entry, and the nodes lying
      between the previous gloss list and this one are passed to
      ``extract_tag_form_line_nodes``.
    * A template named in ``POS_SUBSECTION_TEMPLATES`` contributes a raw tag.
      If the current entry already has senses, a new entry is started first
      so the tag applies only to what follows.

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far; extended in place.
        base_data: Template entry copied for every new entry.
        level_node: The part-of-speech section node.
        pos_title: Section title, forwarded to ``add_new_pos_data``.
    """
    add_new_pos_data(wxr, page_data, base_data, level_node, pos_title)
    last_gloss_list_index = 0
    for index, node in enumerate(level_node.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.startswith("#")
            and node.sarg.endswith("#")
        ):
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
            # Nodes since the previous gloss list (or the section start)
            # form the head line that precedes this gloss list.
            extract_tag_form_line_nodes(
                wxr,
                page_data[-1],
                level_node.children[last_gloss_list_index:index],
            )
            last_gloss_list_index = index + 1
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in POS_SUBSECTION_TEMPLATES
        ):
            # page_data[-1] is re-read after this call on purpose: the
            # "current entry" changes when a new one is appended.
            if len(page_data[-1].senses) > 0:
                add_new_pos_data(
                    wxr, page_data, base_data, level_node, pos_title
                )
            raw_tag = clean_node(wxr, page_data[-1], node).strip("= \n")
            page_data[-1].raw_tags.append(raw_tag)
            translate_raw_tags(page_data[-1])
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Build a sense from one gloss list item and recurse into sub-glosses.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the sense; its ``lang_code`` and
            ``tags`` are read.
        list_item: The LIST_ITEM node to process.
        parent_sense: Sense of the enclosing gloss item, if any. It is
            deep-copied as the starting point, so the child starts with
            everything the parent has accumulated.
    """
    gloss_nodes = []
    sense = (
        Sense() if parent_sense is None else parent_sense.model_copy(deep=True)
    )
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            t_str = clean_node(wxr, sense, node)
            # A template that expands to "(...)" is a label, not gloss text.
            if t_str.startswith("(") and t_str.endswith(")"):
                sense.raw_tags.append(t_str.strip("()"))
            else:
                gloss_nodes.append(t_str)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            if (
                node.sarg.endswith(":")
                and len(sense.examples) > 0
                and sense.examples[-1].translation == ""
            ):
                # A ":" list following an example that has no translation
                # yet supplies that translation.
                for tr_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense.examples[-1].translation = clean_node(
                        wxr, sense, tr_list_item.children
                    )
                    calculate_bold_offsets(
                        wxr,
                        tr_list_item,
                        sense.examples[-1].translation,
                        sense.examples[-1],
                        "bold_translation_offsets",
                    )
            elif node.sarg.endswith(("*", ":")):
                for example_list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(
                        wxr, sense, example_list_item, word_entry.lang_code
                    )
            # Lists with any other marker (e.g. nested "#" glosses) are
            # skipped here; nested gloss lists are handled below.
        else:
            gloss_nodes.append(node)

    gloss_str = clean_node(wxr, sense, gloss_nodes)
    if gloss_str != "":
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        if "form-of" in word_entry.tags:
            extract_form_of_word(wxr, sense, list_item)
        word_entry.senses.append(sense)

    # Runs even when this item produced no gloss, so sub-glosses of an
    # empty parent item are still extracted.
    for list_node in list_item.find_child(NodeKind.LIST):
        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
            for child_list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)
def extract_form_of_word(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
) -> None:
    """Record the word that a form-of gloss points to.

    The text of the last direct LINK child of ``list_item`` is appended to
    ``sense.form_of`` as an ``AltForm``. Nothing is appended when there is
    no link or the last link cleans to an empty string.

    Args:
        wxr: Extraction context, forwarded to ``clean_node``.
        sense: Sense that receives the form-of target.
        list_item: Gloss list item to search for links.
    """
    word = ""
    # Each link overwrites the previous one, so the last link wins.
    for node in list_item.find_child(NodeKind.LINK):
        word = clean_node(wxr, None, node)
    if word != "":
        sense.form_of.append(AltForm(word=word))
def extract_note_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
    """Collect notes from a section and attach them to matching entries.

    If the section has direct LIST children, every non-empty list item
    becomes one note. Otherwise the whole section is cleaned into a single
    note. The notes are added to every entry in ``page_data`` whose
    ``lang_code`` equals that of the last entry.

    Args:
        wxr: Extraction context, forwarded to ``clean_node``.
        page_data: Entries collected so far; their ``notes`` are extended.
            An empty list is a no-op.
        level_node: The note section node.
    """
    notes = []
    has_list = False
    for list_node in level_node.find_child(NodeKind.LIST):
        has_list = True
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            note = clean_node(wxr, None, list_item.children)
            if note != "":
                notes.append(note)
    if not has_list:
        # No list at all: treat the section body as one free-text note.
        note = clean_node(wxr, None, level_node.children)
        if note != "":
            notes.append(note)

    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            data.notes.extend(notes)