Coverage for src/wiktextract/extractor/it/inflection.py: 91%

1import re

2from dataclasses import dataclass

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode

6from ...page import clean_node

7from ...wxr_context import WiktextractContext

8from .models import Form, WordEntry

9from .tags import translate_raw_tags

def extract_tabs_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Add the gender/number forms listed in a ``Tabs`` template call.

    Positional arguments 1-4 are read in order as masculine singular,
    masculine plural, feminine singular and feminine plural.  A value that
    is empty after cleaning, or equal to the current page title, is skipped.
    Each remaining value is appended to ``word_entry.forms``.
    """
    # https://it.wiktionary.org/wiki/Template:Tabs
    position_tags = (
        ("masculine", "singular"),
        ("masculine", "plural"),
        ("feminine", "singular"),
        ("feminine", "plural"),
    )
    # Template positional parameters are keyed by 1-based integers.
    for position, tags in enumerate(position_tags, start=1):
        arg_value = clean_node(
            wxr, None, node.template_parameters.get(position, "")
        )
        if arg_value in ("", wxr.wtp.title):
            continue
        word_entry.forms.append(Form(form=arg_value, tags=list(tags)))

31def extract_it_decl_agg_template(

32 wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode

33) -> None:

34 # https://it.wiktionary.org/wiki/Template:It-decl-agg4

35 # https://it.wiktionary.org/wiki/Template:It-decl-agg2

36 expanded_node = wxr.wtp.parse(

37 wxr.wtp.node_to_wikitext(t_node), expand_all=True

38 )

39 for table in expanded_node.find_child(NodeKind.TABLE):

40 raw_tag = ""

41 col_tags = []

42 for row in table.find_child(NodeKind.TABLE_ROW):

43 row_tag = ""

44 col_index = 0

45 for cell in row.find_child(

46 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL

47 ):

48 match cell.kind:

49 case NodeKind.TABLE_HEADER_CELL:

50 col_span = cell.attrs.get("colspan", "")

51 if col_span != "":

52 raw_tag = clean_node(wxr, None, cell)

53 elif (

54 len(

55 [

56 n

57 for n in row.find_child(

58 NodeKind.TABLE_HEADER_CELL

59 )

60 ]

61 )

62 == 1

63 ):

64 row_tag = clean_node(wxr, None, cell)

65 else:

66 col_header = clean_node(wxr, None, cell)

67 if col_header != "": 67 ↛ 45line 67 didn't jump to line 45 because the condition on line 67 was always true

68 col_tags.append(col_header)

69 case NodeKind.TABLE_CELL: 69 ↛ 45line 69 didn't jump to line 45 because the pattern on line 69 always matched

70 word = clean_node(wxr, None, cell)

71 if word not in ["", wxr.wtp.title]:

72 form = Form(form=word)

73 if raw_tag != "": 73 ↛ 75line 73 didn't jump to line 75 because the condition on line 73 was always true

74 form.raw_tags.append(raw_tag)

75 if row_tag != "": 75 ↛ 77line 75 didn't jump to line 77 because the condition on line 75 was always true

76 form.raw_tags.append(row_tag)

77 if col_index < len(col_tags): 77 ↛ 79line 77 didn't jump to line 79 because the condition on line 77 was always true

78 form.raw_tags.append(col_tags[col_index])

79 translate_raw_tags(form)

80 word_entry.forms.append(form)

81 col_index += 1

def extract_appendix_conjugation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Add conjugation forms from the page ``page_title``.

    The page body is fetched with namespace id 100 (presumably the
    "Appendice" namespace -- confirm against the wiki configuration).  If
    the page does not exist nothing is added.  Otherwise every top-level
    ``it-conj`` template (name compared case-insensitively) is handed to
    ``extract_it_conj_template``.
    """
    # https://it.wiktionary.org/wiki/Appendice:Coniugazioni
    page_text = wxr.wtp.get_page_body(page_title, 100)
    if page_text is None:
        return
    root = wxr.wtp.parse(page_text)
    for t_node in root.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.lower() == "it-conj":
            extract_it_conj_template(wxr, word_entry, t_node, page_title)

@dataclass
class TableHeader:
    """A table header cell together with the grid area it covers."""

    # Cleaned text of the header cell.
    text: str
    # Index of the first column the header covers.
    col_index: int
    # Number of columns the header covers.
    colspan: int
    # Row position and extent; the constructor call visible in this module
    # passes 0 for both.
    row_index: int
    rowspan: int

104

105

106def extract_it_conj_template(

107 wxr: WiktextractContext,

108 word_entry: WordEntry,

109 t_node: TemplateNode,

110 page_title: str,

111) -> None:

112 # https://it.wiktionary.org/wiki/Template:It-conj

113 expanded_node = wxr.wtp.parse(

114 wxr.wtp.node_to_wikitext(t_node), expand_all=True

115 )

116 for table in expanded_node.find_child(NodeKind.TABLE):

117 col_headers = []

118 row_header = ""

119 for row in table.find_child(NodeKind.TABLE_ROW):

120 col_index = 0

121 for cell in row.find_child(

122 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL

123 ):

124 match cell.kind:

125 case NodeKind.TABLE_HEADER_CELL:

126 header_str = clean_node(wxr, None, cell)

127 if header_str in ["persona", "indicativo"]:

128 continue

129 elif header_str in ["condizionale", "congiuntivo"]: 129 ↛ 130line 129 didn't jump to line 130 because the condition on line 129 was never true

130 col_headers.clear()

131 continue

132 elif header_str == "imperativo":

133 col_headers.clear()

134 row_header = "imperativo"

135 continue

136

137 if row.contain_node(NodeKind.TABLE_CELL):

138 row_header = header_str

139 else:

140 colspan = 1

141 colspan_str = cell.attrs.get("colspan", "1")

142 if re.fullmatch(r"\d+", colspan_str): 142 ↛ 144line 142 didn't jump to line 144 because the condition on line 142 was always true

143 colspan = int(colspan_str)

144 col_headers.append(

145 TableHeader(

146 header_str, col_index, colspan, 0, 0

147 )

148 )

149 col_index += colspan

150 case NodeKind.TABLE_CELL: 150 ↛ 121line 150 didn't jump to line 121 because the pattern on line 150 always matched

151 cell_has_table = False

152 for cell_table in cell.find_child_recursively(

153 NodeKind.TABLE

154 ):

155 extract_it_conj_cell_table(

156 wxr,

157 word_entry,

158 cell_table,

159 row_header,

160 col_headers,

161 page_title,

162 )

163 cell_has_table = True

164 if not cell_has_table:

165 for form_str in clean_node(

166 wxr, None, cell

167 ).splitlines():

168 form_str = form_str.strip(", ")

169 if form_str.startswith("verbo di "): 169 ↛ 170line 169 didn't jump to line 170 because the condition on line 169 was never true

170 continue # first row

171 if form_str not in ["", wxr.wtp.title]: 171 ↛ 165line 171 didn't jump to line 165 because the condition on line 171 was always true

172 add_it_conj_form(

173 word_entry,

174 form_str,

175 page_title,

176 row_header,

177 col_index,

178 col_headers,

179 )

180 col_index += 1

181

182

def extract_it_conj_cell_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table_node: WikiNode,
    row_header: str,
    col_headers: list[TableHeader],
    page_title: str,
) -> None:
    """Add the forms of a table nested inside a conjugation table cell.

    Only data cells are visited.  A cell's column index is its position
    among the data cells of its own row, starting at 0.  Each line of the
    cleaned cell text that is neither empty nor equal to the page title is
    added via ``add_it_conj_form`` with the given row and column headers.
    """
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        for col_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            for cell_str in clean_node(wxr, None, cell).splitlines():
                if cell_str in ("", wxr.wtp.title):
                    continue
                add_it_conj_form(
                    word_entry,
                    cell_str,
                    page_title,
                    row_header,
                    col_index,
                    col_headers,
                )

203

204

def add_it_conj_form(
    word_entry: WordEntry,
    form_str: str,
    page_title: str,
    row_header: str,
    col_index: int,
    col_headers: list[TableHeader],
) -> None:
    """Append one conjugated form, with its header tags, to ``word_entry``.

    The form's source is ``page_title``.  Its raw tags are the row header
    (when non-empty) followed by the text of every column header whose
    span ``[col_index, col_index + colspan)`` contains ``col_index``, in
    list order.  The raw tags are passed through ``translate_raw_tags``
    before the form is appended to ``word_entry.forms``.
    """
    form = Form(form=form_str, source=page_title)
    if row_header != "":
        form.raw_tags.append(row_header)
    for header in col_headers:
        first_col = header.col_index
        if first_col <= col_index < first_col + header.colspan:
            form.raw_tags.append(header.text)
    translate_raw_tags(form)
    word_entry.forms.append(form)