Coverage for src/wiktextract/extractor/de/inflection.py: 86%

1import re

2from dataclasses import dataclass

4from wikitextprocessor import NodeKind, TemplateNode

6from ...page import clean_node

7from ...wxr_context import WiktextractContext

8from .flexion import parse_flexion_page

9from .models import Form, WordEntry

10from .tags import translate_raw_tags

12# Kategorie:Wiktionary:Flexionstabelle (Deutsch)

def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection-table template to the matching table parser.

    The decision is made purely on ``t_node.template_name``.  The checks run
    in a fixed order and the first match wins; a template name that matches
    none of them is ignored.
    """
    name = t_node.template_name

    # Template-name endings that are parsed with the noun table parser.
    noun_like_endings = (
        "Nachname Übersicht",
        "Eigenname Übersicht",
        "Vorname Übersicht m",
        "Name Übersicht",
        "Pronomina-Tabelle",
        "Pronomen Übersicht",
        "adjektivisch Übersicht",
        "Substantiv Dialekt",
        "Toponym Übersicht",
    )
    if (
        "Substantiv Übersicht" in name
        or name.endswith(noun_like_endings)
        or re.search(r" Personalpronomen \d$", name)
    ):
        process_noun_table(wxr, word_entry, t_node)
        return

    if name.endswith(("Adjektiv Übersicht", "Adverb Übersicht")):
        process_adj_table(wxr, word_entry, t_node)
        return

    if name.endswith("Verb Übersicht") or name == "Kardinalzahl 2-12":
        process_verb_table(wxr, word_entry, t_node)
        return

    if name == "Deutsch Possessivpronomen":
        extract_pronoun_table(wxr, word_entry, t_node)

@dataclass
class RowspanHeader:
    """A table header cell together with its position and extent.

    Despite the name, the parsers in this file use it both for headers that
    span several rows (``rowspan``) and for headers that span several columns
    (``colspan``).
    """

    # Cleaned text of the header cell.
    text: str
    # Column index where the header starts (row headers are stored with 0).
    index: int
    # Number of rows or columns the header covers.
    span: int

def process_verb_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract verb forms from an expanded verb overview template.

    The template node is serialized back to wikitext, expanded and parsed;
    only the first table in the expansion is read.  Each non-empty line of a
    data cell becomes a ``Form`` appended to ``word_entry.forms``.  A form is
    tagged with the column header at the cell's column index (when one
    exists) and with every row header currently in effect, and the raw tags
    are then passed through ``translate_raw_tags``.  Links and cell lines
    that start with ``Flexion:`` are handed to ``parse_flexion_page``.

    Returns ``None``; ``word_entry`` is modified in place.
    """
    # Template: Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    col_headers = []
    has_person = False
    row_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        header_col_index = 0
        person = ""
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    # Empty header cells do not count as a header column.
                    continue
                elif header_col_index == 0:
                    # First non-empty header of the row: a row header that
                    # may cover several rows.  A malformed ``rowspan`` value
                    # must not abort extraction, so fall back to one row.
                    try:
                        rowspan = int(table_cell.attrs.get("rowspan", "1"))
                    except ValueError:
                        rowspan = 1
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                elif cell_text in ("Person", "Wortform"):
                    # The first data cell of following rows holds a person.
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    person = ""
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        # Acts as a row header for this row only.
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        person = cell_text
                else:
                    for cell_line in cell_text.splitlines():
                        cell_line = cell_line.strip()
                        if cell_line in ["", "—"]:
                            continue
                        elif cell_line.startswith("Flexion:"):
                            parse_flexion_page(wxr, word_entry, cell_line)
                            continue
                        # ``person`` may list several comma-separated values;
                        # an empty ``person`` still yields one pass with
                        # ``p == ""`` so the bare form is emitted.
                        for p in person.split(","):
                            p = p.strip()
                            form_text = cell_line
                            if p != "":
                                form_text = p + " " + cell_line
                            if form_text == wxr.wtp.title:
                                continue
                            form = Form(form=form_text)
                            if col_index < len(col_headers):
                                form.raw_tags.append(col_headers[col_index])
                            for row_header in row_headers:
                                form.raw_tags.append(row_header.text)
                            translate_raw_tags(form)
                            word_entry.forms.append(form)
                col_index += 1

        # A row has been consumed: keep only the row headers that still
        # cover at least one more row.
        new_row_headers = []
        for row_header in row_headers:
            if row_header.span > 1:
                row_header.span -= 1
                new_row_headers.append(row_header)
        row_headers = new_row_headers

134

135

def process_noun_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract forms from an expanded noun-style overview template.

    The template node is serialized back to wikitext, expanded and parsed;
    only the first table in the expansion is read.  Each line of a data cell
    becomes a ``Form`` appended to ``word_entry.forms``, tagged with the
    current table header, the row header and every column header whose span
    covers the cell's column; the raw tags are then passed through
    ``translate_raw_tags``.  After the table, the whole expansion is passed
    to ``clean_node`` with ``word_entry`` and any level-4 section whose title
    starts with ``Anmerkung`` is handed to ``extract_note_section``.

    Returns ``None``; ``word_entry`` is modified in place.
    """
    # Template: Vorlage:Deutsch Substantiv Übersicht
    # NOTE(review): imported locally, presumably to avoid a circular import
    # with ``.page`` — confirm before moving it to module level.
    from .page import extract_note_section

    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_nodes = list(expanded_template.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]
    column_headers = []
    table_header = ""
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        row_has_header = table_row.contain_node(NodeKind.TABLE_HEADER_CELL)
        col_index = 0
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if (
                    cell_text in ["", "Kasus", "Utrum", "m", "f", "m, f"]
                    and col_index == 0
                ):
                    # Leading header cell that is not used as a tag.
                    continue
                elif is_header_row:
                    # A malformed ``colspan`` value must not abort
                    # extraction, so fall back to a single column.
                    try:
                        colspan = int(table_cell.attrs.get("colspan", "1"))
                    except ValueError:
                        colspan = 1
                    if cell_text != "":
                        column_headers.append(
                            RowspanHeader(
                                # drop a trailing number from the header
                                re.sub(r"\s*\d+$", "", cell_text),
                                col_index,
                                colspan,
                            )
                        )
                    col_index += colspan
                else:
                    row_header = cell_text
            elif cell_text == "":
                continue
            elif not row_has_header:
                # Template: Vorlage:Deutsch adjektivisch Übersicht
                # A data cell in a row without header cells starts a new
                # sub-table: remember its text and reset the column headers.
                table_header = cell_text
                column_headers.clear()
                for link_node in table_cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link_node)
                    if link_text.startswith("Flexion:"):
                        parse_flexion_page(wxr, word_entry, link_text)
            else:
                for form_text in cell_text.splitlines():
                    form_text = form_text.strip()
                    if form_text.startswith("(") and form_text.endswith(")"):
                        form_text = form_text.strip("() ")
                    if form_text in ["—", "–", "-", "", "?", wxr.wtp.title]:
                        continue
                    form = Form(form=form_text)
                    if table_header != "":
                        form.raw_tags.append(table_header)
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for col_header in column_headers:
                        if (
                            col_header.text not in ("", "—")
                            and col_index >= col_header.index
                            and col_index < col_header.index + col_header.span
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                col_index += 1

    clean_node(wxr, word_entry, expanded_template)  # category links
    # Template: Vorlage:Deutsch Nachname Übersicht
    for level_node in expanded_template.find_child(NodeKind.LEVEL4):
        section_text = clean_node(wxr, None, level_node.largs)
        if section_text.startswith("Anmerkung"):
            extract_note_section(wxr, word_entry, level_node)

218

219

def process_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract forms from an expanded adjective/adverb overview template.

    Only the first table of the expanded template is read.  Header cells are
    collected into one running list of column headers; every line of a data
    cell becomes a ``Form`` tagged with the header found at the cell's
    position in its row, when such a header exists.  Links inside a cell
    that starts with ``All other forms:`` are passed to
    ``parse_flexion_page``.

    Returns ``None``; ``word_entry`` is modified in place.
    """
    # Template: Vorlage:Deutsch Adjektiv Übersicht
    source_text = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(source_text, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if not tables:
        return

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    skipped_values = ("—", "", "?")
    column_headers = []
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        for position, cell in enumerate(row.find_child(cell_kinds)):
            text = clean_node(wxr, None, cell)

            # The {{int:}} magic word is not implemented, so the template
            # "Textbaustein-Intl" expands to English words.
            if text.startswith("All other forms:"):
                for link in cell.find_child(NodeKind.LINK):
                    parse_flexion_page(
                        wxr, word_entry, clean_node(wxr, None, link)
                    )
                continue

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                column_headers.append(text)
                continue

            for line in text.splitlines():
                if line in skipped_values:
                    continue
                form = Form(form=line)
                if position < len(column_headers):
                    form.raw_tags.append(column_headers[position])
                translate_raw_tags(form)
                word_entry.forms.append(form)

257

258

def extract_pronoun_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from an expanded possessive pronoun template.

    Only the first table of the expanded template is read.  Rows without data
    cells supply headers: a row with a single header cell names a new
    sub-table and resets the column headers, any other header row
    contributes column headers with their ``colspan``.  In data rows the
    cells alternate between an article (even column index) and a word form
    (odd column index); the article is prefixed to the form unless it is
    empty or ``—``.  Forms equal to the page title are not stored.

    Returns ``None``; ``word_entry`` is modified in place.
    """
    # Template: Vorlage:Deutsch Possessivpronomen
    source_text = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(source_text, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if not tables:
        return

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    col_headers = []
    table_title = ""
    for row in tables[0].find_child(NodeKind.TABLE_ROW):
        row_title = ""
        has_data_cells = row.contain_node(NodeKind.TABLE_CELL)
        column = 0
        pending_article = ""
        for cell in row.find_child(cell_kinds):
            text = clean_node(wxr, None, cell)

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if text == "":
                    continue
                if has_data_cells:
                    row_title = text
                    continue
                header_cells = list(row.find_child(NodeKind.TABLE_HEADER_CELL))
                if len(header_cells) == 1:
                    # A lone header cell starts a new table.
                    table_title = text
                    col_headers.clear()
                    pending_article = ""
                    continue
                raw_span = cell.attrs.get("colspan", "1")
                span = int(raw_span) if re.fullmatch(r"\d+", raw_span) else 1
                if text != "—":
                    col_headers.append(RowspanHeader(text, column, span))
                column += span
                continue

            if cell.kind != NodeKind.TABLE_CELL:
                continue

            if column % 2 == 0:
                pending_article = text
            else:
                if pending_article in ["", "—"]:
                    form_text = text
                else:
                    form_text = pending_article + " " + text
                form = Form(form=form_text)
                if table_title != "":
                    form.raw_tags.append(table_title)
                if row_title != "":
                    form.raw_tags.append(row_title)
                for header in col_headers:
                    covers_column = (
                        header.index <= column < header.index + header.span
                    )
                    if covers_column and header.text != "Wortform":
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                if form.form != wxr.wtp.title:
                    word_entry.forms.append(form)
                pending_article = ""
            column += 1