Coverage for src/wiktextract/extractor/es/conjugation.py: 91%

81 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .tags import translate_raw_tags 

9 

10 

def extract_conjugation_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Attach conjugation forms found under ``level_node`` to verb entries.

    Every ``es.v`` template that is a direct child of ``level_node`` is
    processed with ``process_es_v_template``.  The combined forms and
    categories are then appended to each entry of ``page_data`` that has
    the same language code and etymology text as the last entry and whose
    part of speech is ``"verb"``.
    """
    collected_forms = []
    collected_cats = []
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name != "es.v":
            continue
        template_forms, template_cats = process_es_v_template(wxr, template)
        collected_forms += template_forms
        collected_cats += template_cats

    if not page_data:
        return

    # The most recently added entry identifies the language/etymology group
    # this conjugation section belongs to.
    latest = page_data[-1]
    for entry in page_data:
        belongs_to_group = (
            entry.lang_code == latest.lang_code
            and entry.etymology_text == latest.etymology_text
        )
        # The POS restriction carries the original note:
        # "should be fixed on Wiktionary".
        if belongs_to_group and entry.pos == "verb":
            entry.forms.extend(collected_forms)
            entry.categories.extend(collected_cats)

30 

31 

@dataclass
class SpanHeader:
    """A table header cell together with the columns it covers."""

    # Cleaned header text.
    text: str
    # Column index at which the header starts.
    index: int
    # Number of columns the header spans (taken from its colspan).
    span: int

37 

38 

# https://en.wikipedia.org/wiki/Spanish_pronouns
# Maps a pronoun column header (plain, "que ..." and parenthesised
# variants) to its person/number tags.  Built from grouped pairs so
# variants that share tags are listed together; every key gets its own
# list object.
PRONOUN_TAGS: dict[str, list[str]] = {
    pronoun: list(tags)
    for tags, pronouns in (
        (("first-person", "singular"), ("yo", "que yo")),
        (("second-person", "singular"), ("tú", "que tú", "(tú)")),
        (
            ("second-person", "singular", "vos-form"),
            ("vos", "que vos", "(vos)"),
        ),
        (
            ("third-person", "singular"),
            ("él, ella, usted", "que él, que ella, que usted", "(usted)"),
        ),
        (
            ("first-person", "plural"),
            ("nosotros", "que nosotros", "(nosotros)"),
        ),
        (
            ("second-person", "plural"),
            ("vosotros", "que vosotros", "(vosotros)"),
        ),
        (
            ("third-person", "plural"),
            ("ustedes, ellos", "que ustedes, que ellos", "(ustedes)"),
        ),
    )
    for pronoun in pronouns
}

62 

63 

def process_es_v_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Form], list[str]]:
    """Extract conjugated forms and categories from an ``es.v`` template.

    The template is expanded and re-parsed, and the first table in the
    expansion is walked row by row.  Rows made only of header cells define
    column headers (with their colspan); in other rows a header cell is the
    row header and each link inside a data cell yields one ``Form`` tagged
    with the matching column headers and the row header.

    Returns a ``(forms, categories)`` tuple; ``([], [])`` when the
    expansion contains no table.
    """
    # https://es.wiktionary.org/wiki/Plantilla:es.v
    forms = []
    cats = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    # Called with `cats` so the "categories" entry returned at the end is
    # populated from the whole expansion.
    clean_node(wxr, cats, expanded_node)
    # Only the first table is used, so stop searching as soon as it is found.
    table_node = next(
        iter(expanded_node.find_child_recursively(NodeKind.TABLE)), None
    )
    if table_node is None:
        return [], []

    col_headers = []
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        single_cell = len(list(row.filter_empty_str_child())) == 1
        all_header_row = row.contain_node(
            NodeKind.TABLE_HEADER_CELL
        ) and not row.contain_node(NodeKind.TABLE_CELL)
        if single_cell:
            if not all_header_row:
                continue  # ignore end notes
            col_headers.clear()  # new table

        col_index = 0
        is_archaic_row = False
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, cell)
            if cell_text == "":
                continue
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if all_header_row:
                    # The attribute comes from wikitext and may be
                    # malformed; fall back to a single column instead of
                    # aborting the whole extraction.
                    try:
                        colspan = int(cell.attrs.get("colspan", "1"))
                    except ValueError:
                        colspan = 1
                    col_headers.append(
                        SpanHeader(
                            cell_text.removeprefix("Modo ").strip(),
                            col_index,
                            colspan,
                        )
                    )
                    col_index += colspan
                else:
                    # A trailing "^†" on the row header marks every form
                    # of the row as archaic.
                    is_archaic_row = cell_text.endswith("^†")
                    row_header = cell_text.removesuffix("^†").strip()
            else:
                # Nodes accumulated since the previous link; a link node
                # completes one form.
                cell_nodes = []
                for node in cell.children:
                    if isinstance(node, HTMLNode) and node.tag == "sup":
                        # Superscript text annotates the most recent form.
                        sup_tag = clean_node(wxr, None, node.children)
                        if sup_tag != "" and len(forms) > 0:
                            forms[-1].raw_tags.append(sup_tag)
                            translate_raw_tags(forms[-1])
                    elif (
                        isinstance(node, WikiNode)
                        and node.kind == NodeKind.LINK
                    ):
                        cell_nodes.append(node)
                        form = Form(
                            form=clean_node(wxr, None, cell_nodes).lstrip(", ")
                        )
                        for col_head in col_headers:
                            if (
                                col_head.index
                                <= col_index
                                < col_head.index + col_head.span
                            ):
                                form.raw_tags.append(col_head.text)
                                form.tags.extend(
                                    PRONOUN_TAGS.get(col_head.text, [])
                                )
                        if row_header != "":
                            form.raw_tags.append(row_header)
                        if is_archaic_row:
                            form.tags.append("archaic")
                        # Empty text and the "―" placeholder are not forms.
                        if form.form not in ("", "―"):
                            translate_raw_tags(form)
                            forms.append(form)
                        cell_nodes.clear()
                    elif not (
                        isinstance(node, HTMLNode)
                        and "movil" in node.attrs.get("class", "")
                    ):
                        # Keep everything except HTML elements whose class
                        # contains "movil" (original note: hidden HTML tag).
                        cell_nodes.append(node)
                col_index += 1
    return forms, cats.get("categories", [])