Coverage for src/wiktextract/extractor/pt/pos.py: 94%
89 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .example import extract_example_list_item
14from .head_line import extract_head_line_nodes
15from .inflection import extract_flex_template
16from .models import AltForm, Linkage, Sense, WordEntry
17from .section_titles import POS_DATA
18from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
    categories: list[str],
) -> None:
    """Append a word entry for one part-of-speech section to ``page_data``.

    A deep copy of ``base_data`` is appended and given the POS title, the
    ``pos``/``tags`` values looked up in ``POS_DATA`` (keyed by the
    lower-cased title) and the supplied categories. Gloss lists (lists
    whose marker starts and ends with "#") found directly under
    ``level_node`` become senses; the children before the first such list
    are handed to ``extract_head_line_nodes``. Child sections titled
    "Brasil" or "Portugal" are extracted with the section title as a raw
    tag: the first one reuses the current entry, later ones get a fresh
    copy of the entry as it stood before any of them was processed.

    NOTE(review): the nesting below was reconstructed from a flattened
    listing; confirm that ``translate_raw_tags`` belongs inside the
    "Brasil"/"Portugal" branch.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title.lower()]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))
    entry.categories.extend(categories)

    # Index of the earliest gloss list; everything before it is head line.
    head_line_end = len(level_node.children)
    for child_index, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for gloss_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, gloss_item)
        head_line_end = min(head_line_end, child_index)
    head_line_nodes = level_node.children[:head_line_end]
    extract_head_line_nodes(wxr, entry, head_line_nodes)

    # The forms table template may be outside the head line.
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.startswith("flex."):
            extract_flex_template(wxr, entry, template)

    # Snapshot taken before any regional section adds its own data.
    base_data_pos = entry.model_copy(deep=True)
    is_first_variant = True
    for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        section_title = clean_node(wxr, None, child_level_node.largs)
        if section_title not in ["Brasil", "Portugal"]:
            continue
        page_data.append(base_data_pos.model_copy(deep=True))
        if is_first_variant:
            # The first regional section reuses the entry already present.
            page_data.pop()
            is_first_variant = False
        variant_entry = page_data[-1]
        variant_entry.raw_tags.append(section_title)
        for list_node in child_level_node.find_child(NodeKind.LIST):
            marker = list_node.sarg
            if marker.startswith("#") and marker.endswith("#"):
                for gloss_item in list_node.find_child(NodeKind.LIST_ITEM):
                    extract_gloss_list_item(wxr, variant_entry, gloss_item)
        translate_raw_tags(variant_entry)
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry | Linkage,
    list_item: WikiNode,
    parent_gloss: list[str] | None = None,
) -> None:
    """Build a ``Sense`` from one gloss list item and its nested glosses.

    Args:
        wxr: Extraction context passed through to ``clean_node`` and the
            template/example helpers.
        word_entry: Entry whose ``senses`` list receives the new sense;
            its ``tags`` are checked for "form-of".
        list_item: The gloss list item node to process.
        parent_gloss: Glosses of the enclosing sense, used as the prefix
            of this sense's ``glosses``. ``None`` (the default) means
            there is no parent gloss.

    "escopo"/"escopo2" templates contribute raw tags, nested lists whose
    marker ends in "*" or ":" are treated as examples, and every other
    non-list child forms the gloss text. A sense is only stored when that
    text is non-empty. Nested lists whose marker ends in "#" are processed
    recursively with this sense's glosses as ``parent_gloss``.
    """
    # Copy the incoming list: a shared default or the parent's own glosses
    # list must never be mutated by the append further down.
    inherited_glosses = [] if parent_gloss is None else list(parent_gloss)
    gloss_nodes = []
    sense = Sense(glosses=inherited_glosses)
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name == "escopo":
                extract_escopo_template(wxr, sense, node)
            elif node.template_name == "escopo2":
                sense.raw_tags.extend(extract_escopo2_template(wxr, node))
            else:
                gloss_nodes.append(node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Other nested lists are skipped here; "#" lists are handled
            # by the recursion at the end of this function.
            if node.sarg.endswith(("*", ":")):
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, next_list_item)
        else:
            gloss_nodes.append(node)

    gloss_str = clean_node(wxr, sense, gloss_nodes)
    if gloss_str:
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        if "form-of" in word_entry.tags:
            extract_form_of_word(wxr, sense, list_item)
        word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(
                    wxr, word_entry, child_list_item, sense.glosses
                )
def extract_escopo_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Append the labels rendered by an "escopo" template to the sense.

    The template is expanded with ``clean_node`` (passing ``sense``),
    leading and trailing parenthesis characters are stripped, and the
    remaining text is split on ", " and " e ". Each piece that is not
    blank is added, whitespace-trimmed, to ``sense.raw_tags``.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:escopo
    label_text = clean_node(wxr, sense, t_node).strip("()")
    trimmed_pieces = (
        piece.strip() for piece in re.split(r", | e ", label_text)
    )
    sense.raw_tags.extend(piece for piece in trimmed_pieces if piece)
def extract_escopo2_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
) -> list[str]:
    """Return the cleaned text of an "escopo2" template's first arguments.

    Positional parameters 1, 2 and 3 are read in order, stopping at the
    first one that is absent. Each present parameter is cleaned with
    ``clean_node``; empty results are left out of the returned list.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:escopo2
    collected: list[str] = []
    position = 1
    while position <= 3 and position in t_node.template_parameters:
        text = clean_node(wxr, None, t_node.template_parameters[position])
        if text:
            collected.append(text)
        position += 1
    return collected
def extract_form_of_word(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Record the last link found in ``list_item`` as the sense's form-of.

    Every link node found recursively under ``list_item`` is cleaned with
    ``clean_node``; the text of the final one, when non-empty, is appended
    to ``sense.form_of`` as an ``AltForm``.
    """
    last_link_text = ""
    for link in list_item.find_child_recursively(NodeKind.LINK):
        # Later links overwrite earlier ones, so only the last survives.
        last_link_text = clean_node(wxr, None, link)
    if not last_link_text:
        return
    sense.form_of.append(AltForm(word=last_link_text))