Coverage for src/wiktextract/extractor/pt/pos.py: 90%
105 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
3from wikitextprocessor import (
4 HTMLNode,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .head_line import extract_head_line_nodes
14from .inflection import extract_flex_template
15from .models import Example, Linkage, Sense, WordEntry
16from .section_titles import POS_DATA
17from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
    categories: list[str],
) -> None:
    """Append a new entry for one part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data`` and receives
    the POS title, the POS and optional tags looked up in ``POS_DATA`` under
    the lower-cased title, and ``categories``. Gloss lists, the head line
    and "flex." templates found directly under ``level_node`` are then
    handed to their dedicated extractors.

    Raises:
        KeyError: if the lower-cased ``pos_title`` is not a ``POS_DATA`` key.
            The copied entry has already been appended at that point.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title.lower()]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))
    entry.categories.extend(categories)

    # Everything before the first gloss list is treated as the head line;
    # with no gloss list at all, that is every child of the section.
    head_line_end = len(level_node.children)
    for child_index, child_list in level_node.find_child(NodeKind.LIST, True):
        marker = child_list.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for gloss_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, gloss_item)
        head_line_end = min(head_line_end, child_index)
    extract_head_line_nodes(wxr, entry, level_node.children[:head_line_end])

    # The forms table template may not be in the head line, so every
    # template that is a direct child of the section is checked.
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.startswith("flex."):
            extract_flex_template(wxr, entry, template)
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry | Linkage,
    list_item: WikiNode,
    parent_gloss: list[str] | None = None,
) -> None:
    """Turn one gloss list item into a ``Sense`` and recurse into sub-glosses.

    Args:
        wxr: Extraction context passed through to the helpers.
        word_entry: Object whose ``senses`` list receives the new sense.
        list_item: The gloss list item node to process.
        parent_gloss: Glosses of the enclosing list item, used as the
            starting glosses of the new sense. ``None`` (the default) means
            there is no enclosing gloss.

    The sense is appended only when the item yields non-empty gloss text.
    Nested lists whose marker ends with "#" are processed recursively with
    this sense's glosses as their parent glosses, whether or not the sense
    itself was appended.
    """
    gloss_nodes = []
    # Start from a private copy: the sense's gloss list is appended to
    # below, and that must never reach the caller's list. This also avoids
    # a mutable default argument shared between calls.
    sense = Sense(glosses=list(parent_gloss) if parent_gloss is not None else [])
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name == "escopo":
                extract_escopo_template(wxr, sense, node)
            elif node.template_name == "escopo2":
                sense.raw_tags.extend(extract_escopo2_template(wxr, node))
            else:
                gloss_nodes.append(node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Lists ending in "*" or ":" hold examples. Other nested lists
            # are kept out of the gloss text; the "#" ones are handled by
            # the recursion at the end of this function.
            if node.sarg.endswith(("*", ":")):
                for example_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, example_item)
        else:
            gloss_nodes.append(node)

    gloss_str = clean_node(wxr, sense, gloss_nodes)
    if gloss_str:
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(
                    wxr, word_entry, child_list_item, sense.glosses
                )
def extract_escopo_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Add the labels of an "escopo" template to ``sense.raw_tags``.

    The cleaned template text has leading and trailing parentheses
    stripped and is then split on ", " and " e ". Each non-blank piece is
    appended, whitespace-trimmed, as one raw tag.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:escopo
    expanded = clean_node(wxr, sense, t_node)
    labels = (
        piece.strip() for piece in re.split(r", | e ", expanded.strip("()"))
    )
    sense.raw_tags.extend(label for label in labels if label != "")
def extract_escopo2_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
) -> list[str]:
    """Return the raw tags held in positional arguments 1-3 of "escopo2".

    Arguments are read in order and reading stops at the first missing
    position. Arguments whose cleaned text is empty are skipped but do not
    stop the scan.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:escopo2
    params = t_node.template_parameters
    collected = []
    position = 1
    while position <= 3 and position in params:
        tag_text = clean_node(wxr, None, params[position])
        if tag_text != "":
            collected.append(tag_text)
        position += 1
    return collected
def extract_example_list_item(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
) -> None:
    """Build an ``Example`` from one list item and attach it to ``sense``.

    The children of ``list_item`` are scanned in order:

    * the first italic node seen while the text is still empty becomes the
      example text;
    * a ``<small>`` HTML node becomes the translation, with surrounding
      parentheses removed when both are present;
    * the "OESP", "tradex" and "Ex." templates fill ref, text/translation
      and text respectively; other templates are ignored;
    * a bold node that is only digits, in a list item whose text (nested
      lists excluded) ends with ":", makes that text the reference and the
      nested list item the example text, and ends the scan;
    * any other nested list replaces the collected reference nodes with the
      children of its items;
    * everything else is collected as reference material.

    The example is appended to ``sense.examples`` only if it has text. If
    no reference was set explicitly, one is built from the collected nodes.
    """
    example = Example()
    ref_nodes = []

    for node in list_item.children:
        is_wiki_node = isinstance(node, WikiNode)

        if is_wiki_node and node.kind == NodeKind.ITALIC and example.text == "":
            example.text = clean_node(wxr, None, node)
            continue

        if isinstance(node, HTMLNode) and node.tag == "small":
            translation = clean_node(wxr, None, node)
            if translation.startswith("(") and translation.endswith(")"):
                translation = translation.strip("()")
            example.translation = translation
            continue

        if isinstance(node, TemplateNode):
            name = node.template_name
            if name == "OESP":
                example.ref = clean_node(wxr, sense, node).strip("()")
            elif name == "tradex":
                example.text = clean_node(
                    wxr, None, node.template_parameters.get(2, "")
                )
                example.translation = clean_node(
                    wxr, None, node.template_parameters.get(3, "")
                )
                # Result discarded; presumably called so that clean_node
                # can record template side data on the sense - TODO confirm.
                clean_node(wxr, sense, node)
            elif name == "Ex.":
                example.text = clean_node(
                    wxr, sense, node.template_parameters.get(1, "")
                )
            continue

        if is_wiki_node and node.kind == NodeKind.BOLD:
            # A bold node is never collected as reference material, even
            # when it is not the numeric marker handled below.
            if re.fullmatch(r"\d+", clean_node(wxr, None, node)) is None:
                continue
            item_text = clean_node(
                wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
            )
            if not item_text.endswith(":"):
                continue
            ref_nodes.clear()
            example.ref = item_text
            # When there are several nested items, the last one wins.
            for nested_list in list_item.find_child(NodeKind.LIST):
                for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
                    example.text = clean_node(wxr, None, nested_item.children)
            break

        if is_wiki_node and node.kind == NodeKind.LIST:
            ref_nodes.clear()
            ref_nodes.extend(
                nested_item.children
                for nested_item in node.find_child(NodeKind.LIST_ITEM)
            )
            continue

        ref_nodes.append(node)

    if example.text == "":
        return
    if example.ref == "":
        example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n")
    sense.examples.append(example)