Coverage for src/wiktextract/extractor/de/flexion.py: 70%

1from dataclasses import dataclass

3from wikitextprocessor.parser import (

4 LEVEL_KIND_FLAGS,

5 HTMLNode,

6 LevelNode,

7 NodeKind,

8 TemplateNode,

9 WikiNode,

10)

12from ...page import clean_node

13from ...wxr_context import WiktextractContext

14from .models import Form, WordEntry

15from .tags import GRAMMATICAL_TAGS, translate_raw_tags

def parse_flexion_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract inflected forms from the "Flexion" namespace page of a word.

    The page named ``page_title`` is looked up in the "Flexion" namespace
    and parsed.  Every supported inflection template found in it is handed
    to the matching extractor together with ``word_entry``.  Nothing
    happens when the page does not exist.

    See https://de.wiktionary.org/wiki/Hilfe:Flexionsseiten
    """
    # Heading phrases that are kept as one raw tag instead of being matched
    # word by word.
    auxiliary_verb_tags = ("Hilfsverb haben", "Hilfsverb sein")

    page_body = wxr.wtp.get_page_body(
        page_title, wxr.wtp.NAMESPACE_DATA["Flexion"]["id"]
    )
    if page_body is None:
        return

    root = wxr.wtp.parse(page_body)
    # Raw tags taken from the most recent level-2 heading; only the
    # "Deutsch Verb" templates receive them.
    heading_tags: list[str] = []
    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LEVEL2
    for node in root.find_child_recursively(wanted_kinds):
        if node.kind == NodeKind.LEVEL2:
            heading_tags.clear()
            heading = clean_node(wxr, None, node.largs)
            stripped_words = (part.strip(", ") for part in heading.split(" "))
            # A known grammatical tag is skipped when the page title already
            # ends with ":<tag>".
            heading_tags.extend(
                word
                for word in stripped_words
                if word in GRAMMATICAL_TAGS
                and not page_title.endswith(f":{word}")
            )
            heading_tags.extend(
                tag for tag in auxiliary_verb_tags if tag in heading
            )
        elif node.kind == NodeKind.TEMPLATE:
            template_name = node.template_name
            # The exact name must be tested before the generic
            # "Deklinationsseite" prefix, which would match it as well.
            if template_name == "Deklinationsseite Numerale":
                extract_deklinationsseite_numerale_template(
                    wxr, word_entry, node, page_title
                )
            elif template_name.startswith("Deklinationsseite"):
                process_deklinationsseite_template(
                    wxr, word_entry, node, page_title
                )
            elif template_name.startswith("Deutsch Verb"):
                process_deutsch_verb_template(
                    wxr, word_entry, node, page_title, heading_tags
                )

@dataclass
class SpanHeader:
    """A table header together with the range of columns it covers.

    The header applies to the column indexes from ``index`` up to, but not
    including, ``index + span``.
    """

    # Cleaned text of the header cell.
    text: str
    # Index of the first column the header covers.
    index: int
    # Number of columns the header covers.
    span: int

def process_deklinationsseite_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Collect forms from the tables of a "Deklinationsseite..." template.

    The template is expanded and parsed again.  Among the top-level nodes
    of the result, the text of each ``<h4>`` element becomes a raw tag for
    the tables that follow it.  Inside a table, non-empty header cells with
    a ``colspan`` attribute become column headers, other non-empty header
    cells become the row header, and every line of a data cell becomes a
    ``Form`` appended to ``word_entry.forms``.

    Once an "Artikel" or "Wortform" header has been seen, data cells at
    even column indexes are read as the article, which is then prefixed to
    the word form found in the following cell.

    Args:
        wxr: Context used to expand, parse and clean wikitext.
        word_entry: Entry whose ``forms`` list receives the results.
        template_node: The template node to expand.
        page_tite: Stored as ``source`` of every created form.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Adjektiv
    placeholders = ("", "—")  # cell texts that stand for "no value"
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    h4_text = ""
    for node in expanded_template.find_child(NodeKind.HTML | NodeKind.TABLE):
        if isinstance(node, HTMLNode) and node.tag == "h4":
            h4_text = clean_node(wxr, None, node)
        elif node.kind == NodeKind.TABLE:
            col_headers = []
            has_article = False
            for row_node in node.find_child(NodeKind.TABLE_ROW):
                col_index = 0
                row_header = ""
                article = ""
                for cell_node in row_node.find_child(
                    NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
                ):
                    cell_text = clean_node(wxr, None, cell_node)
                    if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                        if cell_text == "":
                            continue
                        elif cell_text in ("Artikel", "Wortform"):
                            has_article = True
                            continue
                        elif "colspan" in cell_node.attrs:
                            col_span = int(cell_node.attrs["colspan"])
                            if col_span == 9:  # new table
                                has_article = False
                                col_headers.clear()
                            col_headers.append(
                                SpanHeader(cell_text, col_index, col_span)
                            )
                            col_index += col_span
                        else:
                            row_header = cell_text
                    elif cell_node.kind == NodeKind.TABLE_CELL:
                        if has_article and col_index % 2 == 0:
                            article = cell_text
                        else:
                            form_prefix = ""
                            if article not in placeholders:
                                form_prefix = article + " "
                            raw_tags = []
                            if h4_text != "":
                                raw_tags.append(h4_text)
                            if row_header != "":
                                raw_tags.append(row_header)
                            for col_header in col_headers:
                                if (
                                    col_header.text not in placeholders
                                    and col_index >= col_header.index
                                    and col_index
                                    < col_header.index + col_header.span
                                ):
                                    raw_tags.append(col_header.text)
                            for line in cell_text.splitlines():
                                # Test the bare word, not the prefixed
                                # form: otherwise a placeholder word next
                                # to a real article ("der —") would be
                                # stored as a form.
                                if line in placeholders:
                                    continue
                                form = Form(
                                    form=form_prefix + line,
                                    source=page_tite,
                                    # Own copy per form, so processing one
                                    # form can never affect the tags of
                                    # the next line.
                                    raw_tags=list(raw_tags),
                                )
                                translate_raw_tags(form)
                                word_entry.forms.append(form)
                        # Article cells and form cells both advance the
                        # column position.
                        col_index += int(cell_node.attrs.get("colspan", "1"))

141

142

def process_deutsch_verb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Expand a "Deutsch Verb ..." template and process its sections.

    The template is converted back to wikitext, expanded and parsed; each
    top-level section of the result is passed on to
    ``process_deutsch_verb_section`` with the other arguments unchanged.
    """
    # Vorlage:Deutsch Verb regelmäßig
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for section_node in expanded_root.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, section_node, page_tite, shared_raw_tags
        )

158

159

def process_deutsch_verb_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Process the tables of one section, then recurse into subsections.

    The section title is added to a copy of ``shared_raw_tags``; the
    caller's list is left untouched.  The extended list is passed to every
    table directly inside the section and to every nested section.
    """
    heading = clean_node(wxr, None, level_node.largs)
    section_raw_tags = [*shared_raw_tags, heading]

    # Tables directly in this section come first ...
    for table in level_node.find_child(NodeKind.TABLE):
        process_deutsch_verb_table(
            wxr, word_entry, table, page_tite, section_raw_tags
        )
    # ... followed by the nested sections, which inherit this section's tags.
    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        process_deutsch_verb_section(
            wxr, word_entry, subsection, page_tite, section_raw_tags
        )

178

179

def process_deutsch_verb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table: WikiNode,
    page_tite: str,
    shared_raw_tags: list[str],
) -> None:
    """Extract verb forms from one table of a "Deutsch Verb" template.

    Rows are read top to bottom.  Header cells, and data cells that look
    like headers, update the column headers or the row header.  Every
    other non-empty data cell is split into lines and each line becomes a
    ``Form`` appended to ``word_entry.forms``.

    Args:
        wxr: Context used to clean node text.
        word_entry: Entry whose ``forms`` list receives the results.
        table: The table node to read.
        page_tite: Stored as ``source`` of every created form.
        shared_raw_tags: Raw tags given to every form; not modified here.
    """
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    col_headers: list[SpanHeader] = []
    for row in table.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        col_index = 0
        col_header_index = 0
        # The cells are needed three times below, so collect them once.
        cells = list(row.find_child(cell_kinds))
        # True when every non-empty data cell of the row contains bold
        # text; header-like cells of such a row become column headers
        # instead of the row header.
        is_bold_col_header = all(
            c.contain_node(NodeKind.BOLD)
            for c in cells
            if c.kind == NodeKind.TABLE_CELL
            and clean_node(wxr, None, c) != ""
        )
        if len(cells) == 1:
            col_headers.clear()  # new table
        for cell in cells:
            cell_text = clean_node(wxr, None, cell)
            if cell_text in (
                "Flexion der Verbaladjektive",
                "(nichterweiterte) Infinitive",
            ):
                # The rest of this row is ignored.
                break
            elif cell.kind == NodeKind.TABLE_HEADER_CELL and cell_text not in (
                "",
                "Person",
            ):
                colspan = int(cell.attrs.get("colspan", "1"))
                col_headers.append(
                    SpanHeader(cell_text, col_header_index, colspan)
                )
                col_header_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                if cell_text in (
                    "",
                    "—",
                    "Text",
                    "Person",
                ) or cell_text.startswith("Flexion:"):
                    # No form in this cell, but it still takes up a column.
                    col_index += 1
                elif (
                    cell.contain_node(NodeKind.BOLD)
                    or (
                        len(list(cell.find_html("small"))) > 0
                        and len(list(cell.filter_empty_str_child())) == 1
                    )
                    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
                    or cell.attrs.get("bgcolor", "").lower() == "#f4f4f4"
                ):  # header in cell
                    colspan = int(cell.attrs.get("colspan", "1"))
                    if is_bold_col_header:
                        for bold_node in cell.find_child(NodeKind.BOLD):
                            col_headers.append(
                                SpanHeader(
                                    clean_node(wxr, None, bold_node),
                                    col_header_index,
                                    colspan,
                                )
                            )
                    else:
                        row_header = cell_text
                    col_header_index += colspan
                else:
                    for form_text in cell_text.splitlines():
                        form_text = form_text.strip(", ")
                        form_raw_tag = ""
                        if ":" in form_text:
                            # "<label>:<form>" - the label becomes a raw tag.
                            form_raw_tag, form_text = form_text.split(":", 1)
                        form_text = form_text.strip()
                        if form_text == "":
                            # Nothing is left after removing separators and
                            # the label; do not record an empty form.
                            continue
                        form = Form(
                            form=form_text,
                            source=page_tite,
                            # Explicit copy: the appends below must never
                            # reach the caller's list, whether or not Form
                            # copies its input.
                            raw_tags=list(shared_raw_tags),
                        )
                        if form_raw_tag != "":
                            form.raw_tags.append(form_raw_tag)
                        if row_header != "":
                            form.raw_tags.append(row_header)
                        for col_header in col_headers:
                            if (
                                col_index >= col_header.index
                                and col_index
                                < col_header.index + col_header.span
                            ):
                                if col_header.text.endswith("I"):
                                    # Kept as one tag; presumably names with
                                    # a Roman numeral such as "Konjunktiv I"
                                    # - TODO confirm.
                                    form.raw_tags.append(col_header.text)
                                else:
                                    for raw_tag in col_header.text.split():
                                        form.raw_tags.append(raw_tag)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    col_index += 1

289

290

def extract_deklinationsseite_numerale_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_tite: str,
) -> None:
    """Extract forms from a "Deklinationsseite Numerale" template.

    The template is expanded and parsed, and every table among the
    top-level nodes is read row by row.  In a row without data cells the
    non-empty header cells become column headers; in a row with data cells
    a non-empty header cell becomes the row header.  The children of a
    non-empty data cell are split at ``<br>`` elements and each non-empty
    piece is passed to ``deklinationsseite_numerale_add_form``.  Italic
    children are not part of the word: their text, stripped of ":" and
    spaces, is collected as a raw tag.

    Cells whose cleaned text is empty are skipped without advancing the
    column position.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Numerale

    def emit(nodes, tags, column, row_title, headers) -> None:
        # Turn the collected nodes into one form, unless they are empty.
        text = clean_node(wxr, None, nodes)
        if text != "":
            deklinationsseite_numerale_add_form(
                word_entry, text, page_tite, tags, column, row_title, headers
            )

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for table in root.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            column = 0
            for cell in row.find_child(cell_kinds):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    continue

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if row_has_data:
                        row_header = cell_text
                        continue
                    col_span = int(cell.attrs.get("colspan", "1"))
                    if column == 0:
                        col_headers.clear()  # new table
                    col_headers.append(SpanHeader(cell_text, column, col_span))
                    column += col_span
                    continue

                # Data cell.  The italic labels are collected for the whole
                # cell; they are not reset at a <br>.
                pending_nodes = []
                cell_tags = []
                for child in cell.children:
                    if isinstance(child, HTMLNode) and child.tag == "br":
                        emit(
                            pending_nodes,
                            cell_tags,
                            column,
                            row_header,
                            col_headers,
                        )
                        pending_nodes.clear()
                        continue
                    if (
                        isinstance(child, WikiNode)
                        and child.kind == NodeKind.ITALIC
                    ):
                        label = clean_node(wxr, None, child).strip(": ")
                        if label != "":
                            cell_tags.append(label)
                        continue
                    pending_nodes.append(child)
                # Whatever follows the last <br> (or the whole cell if there
                # is none).
                emit(pending_nodes, cell_tags, column, row_header, col_headers)
                column += 1

367

368

def deklinationsseite_numerale_add_form(
    word_entry: WordEntry,
    word: str,
    source: str,
    raw_tags: list[str],
    index: int,
    row_header: str,
    col_headers: list[SpanHeader],
) -> None:
    """Create one form and append it to ``word_entry.forms``.

    The raw tags of the form are ``raw_tags``, followed by ``row_header``
    when it is not empty, followed by the text of every column header
    whose span contains ``index``.  The raw tags are then translated with
    ``translate_raw_tags``.

    Args:
        word_entry: Entry whose ``forms`` list receives the new form.
        word: Text of the form.
        source: Stored as ``source`` of the form.
        raw_tags: Tags collected by the caller; not modified here.
        index: Column index of the cell the word came from.
        row_header: Header of the row, or ``""`` when there is none.
        col_headers: Column headers of the current table.
    """
    form = Form(
        form=word,
        source=source,
        # Explicit copy: the caller reuses its list for further forms from
        # the same cell, so the appends below must never reach it, whether
        # or not Form copies its input.
        raw_tags=list(raw_tags),
    )
    if row_header != "":
        form.raw_tags.append(row_header)
    for col_header in col_headers:
        if col_header.index <= index < col_header.index + col_header.span:
            form.raw_tags.append(col_header.text)
    translate_raw_tags(form)
    word_entry.forms.append(form)