Coverage for src/wiktextract/extractor/pt/pos.py: 90%

105 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2 

3from wikitextprocessor import ( 

4 HTMLNode, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .head_line import extract_head_line_nodes 

14from .inflection import extract_flex_template 

15from .models import Example, Linkage, Sense, WordEntry 

16from .section_titles import POS_DATA 

17from .tags import translate_raw_tags 

18 

19 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
    categories: list[str],
) -> None:
    """Append a new entry for one part-of-speech section to ``page_data``.

    The entry starts as a deep copy of ``base_data`` and receives the section
    title, the POS (and optional tags) looked up in ``POS_DATA`` under the
    lower-cased title, and the given categories. Gloss lists (marker starting
    and ending with ``#``) are turned into senses, the children located before
    the first such list are handed to the head line extractor, and every
    direct child template whose name starts with ``flex.`` is handed to the
    inflection extractor.

    Raises ``KeyError`` if the lower-cased ``pos_title`` is not a key of
    ``POS_DATA``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title.lower()]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))
    entry.categories.extend(categories)

    # Index of the earliest gloss list; stays at "end of children" when the
    # section contains no gloss list at all.
    head_line_end = len(level_node.children)
    for position, child_list in level_node.find_child(NodeKind.LIST, True):
        marker = child_list.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for gloss_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, gloss_item)
        head_line_end = min(head_line_end, position)

    extract_head_line_nodes(wxr, entry, level_node.children[:head_line_end])

    # The forms table template is not necessarily part of the head line, so
    # scan all direct child templates of the section.
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.startswith("flex."):
            extract_flex_template(wxr, entry, template)

49 

50 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry | Linkage,
    list_item: WikiNode,
    parent_gloss: list[str] | None = None,
) -> None:
    """Build a ``Sense`` from one gloss list item and add it to ``word_entry``.

    Args:
        wxr: Extraction context passed through to ``clean_node`` and the
            template/example helpers.
        word_entry: Object whose ``senses`` list receives the new sense.
        list_item: The gloss list item node to process.
        parent_gloss: Glosses of the enclosing sense; they are copied into the
            new sense before its own gloss is appended. ``None`` (the
            default) means there is no parent gloss.

    Handling of the item's children:
        * ``escopo`` / ``escopo2`` templates contribute raw tags;
        * nested lists whose marker ends with ``*`` or ``:`` are processed as
          examples of this sense; other nested lists are left out of the
          gloss text;
        * everything else is part of the gloss text.

    The sense is appended only when the cleaned gloss text is non-empty.
    Nested lists whose marker ends with ``#`` are then processed recursively
    with this sense's glosses as ``parent_gloss``.
    """
    gloss_nodes = []
    # Always hand Sense a fresh list: this avoids sharing a mutable default
    # between calls and keeps the append below from touching the parent's
    # gloss list.
    inherited = [] if parent_gloss is None else list(parent_gloss)
    sense = Sense(glosses=inherited)
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name == "escopo":
                extract_escopo_template(wxr, sense, node)
            elif node.template_name == "escopo2":
                sense.raw_tags.extend(extract_escopo2_template(wxr, node))
            else:
                gloss_nodes.append(node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            if node.sarg.endswith(("*", ":")):
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, next_list_item)
        else:
            gloss_nodes.append(node)

    gloss_str = clean_node(wxr, sense, gloss_nodes)
    if gloss_str:
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    # Sub-glosses inherit the glosses accumulated so far, even when this
    # item's own gloss text was empty and no sense was appended for it.
    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(
                    wxr, word_entry, child_list_item, sense.glosses
                )

86 

87 

def extract_escopo_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Add the labels of an ``escopo`` template to ``sense.raw_tags``.

    The template is expanded to text with ``clean_node``, leading and
    trailing parentheses are stripped, and the result is split on ``", "``
    and ``" e "``. Each non-blank piece is appended with surrounding
    whitespace removed.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:escopo
    expanded = clean_node(wxr, sense, t_node).strip("()")
    pieces = (piece.strip() for piece in re.split(r", | e ", expanded))
    sense.raw_tags.extend(piece for piece in pieces if piece != "")

98 

99 

def extract_escopo2_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
) -> list[str]:
    """Return the raw tags given as positional arguments of ``escopo2``.

    Positional parameters 1 to 3 are read in order; reading stops at the
    first missing position. Each present parameter is cleaned with
    ``clean_node`` and kept only if the cleaned text is non-empty.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:escopo2
    params = t_node.template_parameters
    collected = []
    for position in (1, 2, 3):
        if position in params:
            tag_text = clean_node(wxr, None, params[position])
            if tag_text != "":
                collected.append(tag_text)
        else:
            # A gap ends the scan; later positions are not inspected.
            break
    return collected

113 

114 

def extract_example_list_item(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
) -> None:
    """Build an ``Example`` from one list item and append it to ``sense``.

    Children of ``list_item`` are inspected in order:
        * the first italic node (while no text is set) becomes the text;
        * a ``<small>`` HTML node becomes the translation, with the
          surrounding parentheses stripped when it both starts with ``(``
          and ends with ``)``;
        * templates ``OESP`` (reference), ``tradex`` (text from parameter 2,
          translation from parameter 3) and ``Ex.`` (text from parameter 1)
          fill the matching fields; other templates are ignored;
        * a bold node whose text is all digits, in an item whose non-list
          content ends with ``:``, makes that content the reference and
          takes the text from the nested list items (the last one wins);
          scanning stops there;
        * a nested list replaces the collected reference nodes with the
          children of its items;
        * anything else is collected as a reference node.

    The example is appended only if its text is non-empty. If no reference
    was set by then, it is built from the collected reference nodes.
    """
    example = Example()
    pending_ref = []

    for child in list_item.children:
        # None for plain (non-WikiNode) children, so kind tests below fail.
        child_kind = child.kind if isinstance(child, WikiNode) else None

        if child_kind == NodeKind.ITALIC and example.text == "":
            example.text = clean_node(wxr, None, child)
        elif isinstance(child, HTMLNode) and child.tag == "small":
            translated = clean_node(wxr, None, child)
            if translated.startswith("(") and translated.endswith(")"):
                translated = translated.strip("()")
            example.translation = translated
        elif isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name == "OESP":
                example.ref = clean_node(wxr, sense, child).strip("()")
            elif template_name == "tradex":
                params = child.template_parameters
                example.text = clean_node(wxr, None, params.get(2, ""))
                example.translation = clean_node(wxr, None, params.get(3, ""))
                # Result unused; the call is made with `sense` rather than
                # None, presumably for its side effects on the sense.
                clean_node(wxr, sense, child)
            elif template_name == "Ex.":
                example.text = clean_node(
                    wxr, sense, child.template_parameters.get(1, "")
                )
        elif child_kind == NodeKind.BOLD:
            bold_text = clean_node(wxr, None, child)
            if re.fullmatch(r"\d+", bold_text) is None:
                continue
            header_nodes = list(list_item.invert_find_child(NodeKind.LIST))
            header_text = clean_node(wxr, None, header_nodes)
            if not header_text.endswith(":"):
                continue
            pending_ref.clear()
            example.ref = header_text
            for nested_list in list_item.find_child(NodeKind.LIST):
                for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
                    example.text = clean_node(wxr, None, nested_item.children)
            # Reference and text are both settled; skip remaining children.
            break
        elif child_kind == NodeKind.LIST:
            pending_ref.clear()
            pending_ref.extend(
                nested_item.children
                for nested_item in child.find_child(NodeKind.LIST_ITEM)
            )
        else:
            pending_ref.append(child)

    if example.text == "":
        return
    if example.ref == "":
        example.ref = clean_node(wxr, sense, pending_ref).strip(":() \n")
    sense.examples.append(example)