Coverage for src/wiktextract/extractor/es/conjugation.py: 93%

88 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .tags import translate_raw_tags 

9 

10 

def extract_conjugation_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Attach conjugation forms found in a section to matching verb entries.

    Every ``es.v`` / ``en.v`` template that is a direct child of
    ``level_node`` is processed; the forms and categories it yields are then
    added to each entry of ``page_data`` that is a verb and shares the
    language code and etymology text of the last entry in ``page_data``.
    """
    collected_forms = []
    collected_cats = []
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in ["es.v", "en.v"]:
            continue
        template_forms, template_cats = process_es_v_template(wxr, template)
        collected_forms.extend(template_forms)
        collected_cats.extend(template_cats)

    if not page_data:
        return  # nothing to attach the forms to

    latest = page_data[-1]
    for entry in page_data:
        same_lang = entry.lang_code == latest.lang_code
        same_etymology = entry.etymology_text == latest.etymology_text
        # The "verb" restriction should be fixed on Wiktionary.
        if same_lang and same_etymology and entry.pos == "verb":
            entry.forms.extend(collected_forms)
            entry.categories.extend(collected_cats)

30 

31 

@dataclass
class SpanHeader:
    """A column header of a conjugation table and the columns it covers."""

    # Header label as displayed in the table.
    text: str
    # Index of the first column covered by this header.
    index: int
    # Number of consecutive columns covered, starting at ``index``.
    span: int

37 

38 

# Maps a pronoun column header of the conjugation table to grammatical tags.
# Keys must match the header text exactly, including the "que " prefix and
# the parenthesised variants.
# https://en.wikipedia.org/wiki/Spanish_pronouns
PRONOUN_TAGS = {
    "yo": ["first-person", "singular"],
    "que yo": ["first-person", "singular"],
    "tú": ["second-person", "singular"],
    "que tú": ["second-person", "singular"],
    "(tú)": ["second-person", "singular"],
    "vos": ["second-person", "singular", "vos-form"],
    "que vos": ["second-person", "singular", "vos-form"],
    "(vos)": ["second-person", "singular", "vos-form"],
    "él, ella, usted": ["third-person", "singular"],
    "que él, que ella, que usted": ["third-person", "singular"],
    "(usted)": ["third-person", "singular"],
    "nosotros": ["first-person", "plural"],
    "que nosotros": ["first-person", "plural"],
    "(nosotros)": ["first-person", "plural"],
    "vosotros": ["second-person", "plural"],
    "que vosotros": ["second-person", "plural"],
    "(vosotros)": ["second-person", "plural"],
    "ustedes, ellos": ["third-person", "plural"],
    "que ustedes, que ellos": ["third-person", "plural"],
    "(ustedes)": ["third-person", "plural"],
    # Template:en.v
    "I": ["first-person"],
    "you": ["second-person"],
    "(you)": ["second-person"],
    "he, she, it": ["third-person", "singular"],
    # This column mixes first, second and third person, so only the number
    # is common to all of its pronouns.
    "we, you, they": ["plural"],
    # "we" is first person, like "(nosotros)" above.
    "(we)": ["first-person", "plural"],
}

69 

70 

def _parse_colspan(raw_value: str) -> int:
    """Convert a ``colspan`` attribute value to a column count of at least 1.

    The attribute comes from wiki markup, so it may be empty or non-numeric.
    Such values, and values below 1, fall back to a span of 1 instead of
    raising or moving the column index backwards.
    """
    try:
        colspan = int(raw_value)
    except ValueError:
        return 1
    return max(colspan, 1)


def process_es_v_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Form], list[str]]:
    """Expand a conjugation template and extract forms from its first table.

    Returns a ``(forms, categories)`` pair. ``categories`` is the
    "categories" entry of the dict passed to ``clean_node`` for the expanded
    template. When the expansion contains no table, both lists are empty.

    Rows consisting only of header cells define the column headers; a row
    with a single such cell starts a new sub-table and resets them. In other
    rows a header cell supplies the row header (a trailing "^†" marks the row
    as archaic) and each data cell is turned into forms by
    ``process_es_v_cell``.
    """
    # https://es.wiktionary.org/wiki/Plantilla:es.v
    forms = []
    cats = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, cats, expanded_node)
    table_nodes = list(expanded_node.find_child_recursively(NodeKind.TABLE))
    if not table_nodes:
        return [], []
    table_node = table_nodes[0]
    col_headers = []
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        single_cell = len(list(row.filter_empty_str_child())) == 1
        all_header_row = row.contain_node(
            NodeKind.TABLE_HEADER_CELL
        ) and not row.contain_node(NodeKind.TABLE_CELL)
        if not all_header_row and single_cell:
            continue  # ignore end notes
        if all_header_row and single_cell:
            col_headers.clear()  # new table

        col_index = 0
        is_archaic_row = False
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell_text == "":
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if all_header_row:
                    # Tolerate malformed colspan values from wiki markup.
                    colspan = _parse_colspan(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        SpanHeader(
                            cell_text.removeprefix("Modo ").strip(),
                            col_index,
                            colspan,
                        )
                    )
                    col_index += colspan
                else:
                    # Row header; it does not advance the column index.
                    is_archaic_row = cell_text.endswith("^†")
                    row_header = cell_text.removesuffix("^†").strip()
            else:
                cell_nodes = []
                for node in cell.children:
                    if isinstance(node, HTMLNode) and node.tag == "sup":
                        # A superscript annotates the most recently added
                        # form rather than becoming part of a form string.
                        sup_tag = clean_node(wxr, None, node.children)
                        if sup_tag != "" and len(forms) > 0:
                            forms[-1].raw_tags.append(sup_tag)
                            translate_raw_tags(forms[-1])
                    elif (
                        isinstance(node, WikiNode)
                        and node.kind == NodeKind.LINK
                    ):
                        # Flush at every link so that a following <sup>
                        # attaches to the form built from this link.
                        cell_nodes.append(node)
                        forms.extend(
                            process_es_v_cell(
                                wxr,
                                cell_nodes,
                                col_index,
                                col_headers,
                                row_header,
                                is_archaic_row,
                            )
                        )
                        cell_nodes.clear()
                    elif not (
                        isinstance(node, HTMLNode)
                        and "movil" in node.attrs.get("class", "")
                    ):
                        # HTML nodes whose class contains "movil" are
                        # skipped (hidden HTML tag); everything else is kept.
                        cell_nodes.append(node)
                if len(cell_nodes) > 0:
                    forms.extend(
                        process_es_v_cell(
                            wxr,
                            cell_nodes,
                            col_index,
                            col_headers,
                            row_header,
                            is_archaic_row,
                        )
                    )
                col_index += 1
    return forms, cats.get("categories", [])

161 

162 

def process_es_v_cell(
    wxr: WiktextractContext,
    cell_nodes: list[WikiNode | str],
    col_index: int,
    col_headers: list[SpanHeader],
    row_header: str,
    is_archaic: bool,
) -> list[Form]:
    """Build the forms contained in one table cell.

    The cleaned text of ``cell_nodes`` is split on commas. A piece that is
    empty, equal to "―" or equal to ``wxr.wtp.title`` is dropped; every other
    piece becomes a ``Form``. Each form receives the text of every column
    header whose span covers ``col_index`` as a raw tag, plus the matching
    ``PRONOUN_TAGS`` entry as tags, then ``row_header`` (when non-empty) as a
    raw tag and "archaic" when ``is_archaic`` is set. ``translate_raw_tags``
    is applied last.
    """
    # Header texts applying to this column, in col_headers order.
    header_texts = [
        header.text
        for header in col_headers
        if header.index <= col_index < header.index + header.span
    ]
    ignored = ["", "―", wxr.wtp.title]

    result = []
    for piece in clean_node(wxr, None, cell_nodes).split(","):
        form = Form(form=piece.strip())
        if form.form in ignored:
            continue
        for header_text in header_texts:
            form.raw_tags.append(header_text)
            form.tags.extend(PRONOUN_TAGS.get(header_text, []))
        if row_header != "":
            form.raw_tags.append(row_header)
        if is_archaic:
            form.tags.append("archaic")
        translate_raw_tags(form)
        result.append(form)
    return result

188 return forms