Coverage for src/wiktextract/extractor/es/conjugation.py: 93%

88 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .tags import translate_raw_tags 

9 

10 

def extract_conjugation_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Attach the forms found in a conjugation section to verb entries.

    Every direct child template of ``level_node`` named ``es.v``, ``en.v``,
    ``de.v`` or ``pl.v`` is handed to ``process_es_v_template``.  The forms
    and categories it returns are appended to each entry of ``page_data``
    that has the same language code and etymology text as the last entry
    and whose part of speech is ``"verb"``.

    Args:
        wxr: Extraction context, passed through to the template processor.
        page_data: Entries collected so far for the page; matching entries
            are modified in place.
        level_node: Node of the conjugation section.
    """
    supported_templates = ("es.v", "en.v", "de.v", "pl.v")
    section_forms = []
    section_cats = []
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in supported_templates:
            continue
        template_forms, template_cats = process_es_v_template(wxr, template)
        section_forms += template_forms
        section_cats += template_cats

    if not page_data:
        return  # nothing to attach the forms to

    latest = page_data[-1]
    for entry in page_data:
        # Only verb entries receive the forms; the original author notes
        # that this "should be fixed on Wiktionary".
        if entry.pos != "verb":
            continue
        if entry.lang_code != latest.lang_code:
            continue
        if entry.etymology_text != latest.etymology_text:
            continue
        entry.forms.extend(section_forms)
        entry.categories.extend(section_cats)

30 

31 

@dataclass
class SpanHeader:
    """A table column header together with the range of columns it covers."""

    # Cleaned header label; used as a raw tag for forms under this header.
    text: str
    # Index of the first column the header covers.
    index: int
    # Number of consecutive columns covered, starting at ``index``.
    span: int

37 

38 

# Maps the pronoun shown in a conjugation table header to grammatical tags.
# Keys must match the cleaned header text exactly: "que ..." variants and
# parenthesised variants are listed as separate keys.
# https://en.wikipedia.org/wiki/Spanish_pronouns
PRONOUN_TAGS: dict[str, list[str]] = {
    "yo": ["first-person", "singular"],
    "que yo": ["first-person", "singular"],
    "tú": ["second-person", "singular"],
    "que tú": ["second-person", "singular"],
    "(tú)": ["second-person", "singular"],
    "vos": ["second-person", "singular", "vos-form"],
    "que vos": ["second-person", "singular", "vos-form"],
    "(vos)": ["second-person", "singular", "vos-form"],
    "él, ella, usted": ["third-person", "singular"],
    "que él, que ella, que usted": ["third-person", "singular"],
    "(usted)": ["third-person", "singular"],
    "nosotros": ["first-person", "plural"],
    "que nosotros": ["first-person", "plural"],
    "(nosotros)": ["first-person", "plural"],
    "vosotros": ["second-person", "plural"],
    "que vosotros": ["second-person", "plural"],
    "(vosotros)": ["second-person", "plural"],
    "ustedes, ellos": ["third-person", "plural"],
    "que ustedes, que ellos": ["third-person", "plural"],
    "(ustedes)": ["third-person", "plural"],
    # Template:en.v
    "I": ["first-person"],
    "you": ["second-person"],
    "(you)": ["second-person"],
    "he, she, it": ["third-person", "singular"],
    # NOTE(review): this header also names first- and second-person
    # pronouns, so "third-person" looks too narrow - confirm the intended
    # tagging before changing it.
    "we, you, they": ["third-person", "plural"],
    # "we" is first person, consistent with "(nosotros)", "(wir)", "(my)".
    "(we)": ["first-person", "plural"],
    # Template:de.v
    "ich": ["first-person", "singular"],
    "du": ["second-person", "singular"],
    "er, sie, es": ["third-person", "singular"],
    "wir": ["first-person", "plural"],
    "ihr": ["second-person", "plural"],
    "sie": ["third-person", "plural"],
    "(du)": ["second-person", "singular"],
    "(wir)": ["first-person", "plural"],
    "(ihr)": ["second-person", "plural"],
    "(Sie)": ["third-person", "plural"],
    # Template:pl.v
    "ja": ["first-person", "singular"],
    "ty": ["second-person", "singular"],
    "on, ona, ono": ["third-person", "singular"],
    "my": ["first-person", "plural"],
    "wy": ["second-person", "plural"],
    "oni, one": ["third-person", "plural"],
    "(ty)": ["second-person", "singular"],
    "(my)": ["first-person", "plural"],
    "(wy)": ["second-person", "plural"],
}

90 

91 

def _parse_colspan(cell: WikiNode) -> int:
    """Return the ``colspan`` attribute of ``cell`` as a positive integer.

    A missing, non-numeric or non-positive value is treated as ``1`` so
    that one malformed attribute cannot abort extraction of the page or
    move the column counter backwards.
    """
    try:
        return max(int(cell.attrs.get("colspan", "1")), 1)
    except ValueError:
        return 1


def process_es_v_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Form], list[str]]:
    """Extract conjugated forms and categories from a conjugation template.

    The template is expanded and the first table found in the expansion is
    walked row by row:

    * a row made only of header cells defines the column headers; when such
      a row holds a single cell, the headers collected so far are dropped
      first (start of a new sub-table);
    * a header cell in any other row is the row header; a trailing ``^†``
      marks the row as archaic and is stripped from the header text;
    * every data cell is turned into forms by ``process_es_v_cell``.

    Args:
        wxr: Extraction context used for expanding and cleaning nodes.
        template_node: The conjugation template to process.

    Returns:
        A tuple ``(forms, categories)``.  Both lists are empty when the
        expansion contains no table; categories gathered during cleaning
        are discarded in that case as well.
    """
    # https://es.wiktionary.org/wiki/Plantilla:es.v
    forms: list[Form] = []
    cats: dict = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    # The cleaned text is not needed; `cats` is passed so the "categories"
    # key read at the end of this function can be filled in.
    clean_node(wxr, cats, expanded_node)
    # Only the first table is used, so stop searching once it is found.
    table_node = next(
        iter(expanded_node.find_child_recursively(NodeKind.TABLE)), None
    )
    if table_node is None:
        return [], []

    col_headers: list[SpanHeader] = []
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        single_cell = len(list(row.filter_empty_str_child())) == 1
        all_header_row = row.contain_node(
            NodeKind.TABLE_HEADER_CELL
        ) and not row.contain_node(NodeKind.TABLE_CELL)
        if single_cell and not all_header_row:
            continue  # ignore end notes
        if single_cell and all_header_row:
            col_headers.clear()  # new table

        col_index = 0
        is_archaic_row = False
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell_text == "":
                # Empty cells are skipped without advancing `col_index`.
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if all_header_row:
                    colspan = _parse_colspan(cell)
                    col_headers.append(
                        SpanHeader(
                            cell_text.removeprefix("Modo ").strip(),
                            col_index,
                            colspan,
                        )
                    )
                    col_index += colspan
                else:
                    is_archaic_row = cell_text.endswith("^†")
                    row_header = cell_text.removesuffix("^†").strip()
                continue

            # Data cell: nodes are buffered and flushed into forms after
            # each link, so one cell can produce several groups of forms.
            pending_nodes = []
            for node in cell.children:
                if isinstance(node, HTMLNode) and node.tag == "sup":
                    # A superscript annotates the most recently added form.
                    sup_tag = clean_node(wxr, None, node.children)
                    if sup_tag != "" and forms:
                        forms[-1].raw_tags.append(sup_tag)
                        translate_raw_tags(forms[-1])
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    pending_nodes.append(node)
                    forms.extend(
                        process_es_v_cell(
                            wxr,
                            pending_nodes,
                            col_index,
                            col_headers,
                            row_header,
                            is_archaic_row,
                        )
                    )
                    pending_nodes.clear()
                elif not (
                    isinstance(node, HTMLNode)
                    and "movil" in node.attrs.get("class", "")
                ):
                    # HTML nodes whose class contains "movil" are dropped
                    # (the original comment calls them hidden HTML tags);
                    # everything else is buffered.
                    pending_nodes.append(node)
            if pending_nodes:
                forms.extend(
                    process_es_v_cell(
                        wxr,
                        pending_nodes,
                        col_index,
                        col_headers,
                        row_header,
                        is_archaic_row,
                    )
                )
            col_index += 1
    return forms, cats.get("categories", [])

182 

183 

def process_es_v_cell(
    wxr: WiktextractContext,
    cell_nodes: list[WikiNode | str],
    col_index: int,
    col_headers: list[SpanHeader],
    row_header: str,
    is_archaic: bool,
) -> list[Form]:
    """Build ``Form`` objects from the content of one table data cell.

    The cleaned cell text is split on commas, each piece giving one form.
    A form gets the text of every column header whose span covers
    ``col_index`` as a raw tag (plus any tags ``PRONOUN_TAGS`` lists for
    that text), then ``row_header`` as a raw tag when it is not empty, and
    the ``"archaic"`` tag when ``is_archaic`` is set.  Pieces that are
    empty, equal to ``"―"`` or equal to the page title are dropped.

    Args:
        wxr: Extraction context used to clean the nodes.
        cell_nodes: Nodes making up the cell content.
        col_index: Column index of the cell within its row.
        col_headers: Column headers collected for the current table.
        row_header: Header text of the current row, or ``""``.
        is_archaic: Whether the row was marked as archaic.

    Returns:
        The forms found in the cell, in order of appearance.
    """
    cell_text = clean_node(wxr, None, cell_nodes)
    # The covering headers are the same for every form of the cell.
    covering_headers = [
        header
        for header in col_headers
        if header.index <= col_index < header.index + header.span
    ]
    ignored_values = ("", "―", wxr.wtp.title)

    cell_forms = []
    for piece in cell_text.split(","):
        entry = Form(form=piece.strip())
        for header in covering_headers:
            entry.raw_tags.append(header.text)
            entry.tags.extend(PRONOUN_TAGS.get(header.text, []))
        if row_header != "":
            entry.raw_tags.append(row_header)
        if is_archaic:
            entry.tags.append("archaic")
        if entry.form in ignored_values:
            continue
        translate_raw_tags(entry)
        cell_forms.append(entry)
    return cell_forms