Coverage for src/wiktextract/extractor/pl/inflection.py: 94%

237 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-11 10:26 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_inflection_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    lang_code: str,
    level_node: WikiNode,
) -> None:
    """Collect inflected forms from an inflection section.

    List items under ``level_node`` are scanned child by child: a text
    child containing a parenthesised index such as ``(1.1)`` updates the
    current sense index, and a template child is handed to
    ``extract_inflection_template`` together with that index.  When the
    section contains no list at all, every template found recursively is
    processed instead.

    The resulting forms are appended to each entry of ``page_data`` whose
    ``lang_code`` matches and whose senses match the form's sense index
    (forms without a sense index are added unconditionally).
    """
    # function-level import kept as in the original
    # (presumably avoids a circular import - confirm before moving it)
    from .page import match_sense_index

    current_sense = ""
    collected = []
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in item.children:
            if isinstance(child, str):
                # NOTE: ",-." inside the class is a range that covers
                # exactly ",", "-" and "."
                found = re.search(r"\([\d\s,-.]+\)", child)
                if found is not None:
                    current_sense = found.group(0).strip("()")
            elif isinstance(child, TemplateNode):
                collected += extract_inflection_template(
                    wxr, child, current_sense
                )

    if not level_node.contain_node(NodeKind.LIST):
        # templates may be nested (e.g. in a "preformatted" node), so the
        # search has to be recursive
        for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
            collected += extract_inflection_template(
                wxr, template, current_sense
            )

    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for candidate in collected:
            unscoped = candidate.sense_index == ""
            if unscoped or match_sense_index(candidate.sense_index, entry):
                entry.forms.append(candidate)

44 

45 

def extract_inflection_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Route an inflection template to the extractor for its table type.

    Returns the forms produced by the matching extractor, or an empty
    list when the template name is not one of the supported ones.
    """
    name = template_node.template_name

    # Polish and Czech noun templates share one extractor
    if name in ("odmiana-rzeczownik-polski", "odmiana-rzeczownik-czeski"):
        return extract_odmiana_rzeczownik_polski(
            wxr, template_node, sense_index
        )
    if name == "odmiana-przymiotnik-polski":
        return extract_odmiana_przymiotnik_polski(
            wxr, template_node, sense_index
        )
    if name == "odmiana-czasownik-polski":
        return extract_odmiana_czasownik_polski(wxr, template_node, sense_index)
    if name == "odmiana-rzeczownik-esperanto":
        return odmiana_rzeczownik_esperanto(wxr, template_node, sense_index)
    return []

66 

67 

def _flush_noun_form(
    wxr: WiktextractContext,
    forms: list[Form],
    sense_index: str,
    raw_tags: list[str],
    form_nodes: list,
    form_raw_tags: list[str],
    form_tags: list[str],
) -> None:
    """Turn the buffered nodes into a Form, then reset the three buffers.

    The form is appended to ``forms`` unless its cleaned text is empty or
    equal to the page title.  ``form_nodes``, ``form_raw_tags`` and
    ``form_tags`` are cleared in place either way.
    """
    form_text = clean_node(wxr, None, form_nodes)
    if form_text != "" and form_text != wxr.wtp.title:
        form = Form(
            form=form_text,
            sense_index=sense_index,
            raw_tags=raw_tags + form_raw_tags,
            # copy: the buffer is cleared right below
            tags=list(form_tags),
        )
        translate_raw_tags(form)
        forms.append(form)
    form_nodes.clear()
    form_raw_tags.clear()
    form_tags.clear()


def extract_odmiana_rzeczownik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract noun forms directly from the template's arguments.

    Each string-named argument yields one or more forms.  The argument
    name supplies the raw tags: names starting with "Forma" map to
    "depr."/"ndepr." plus "M.", "W.", "lm"; any other name is lower-cased
    and split on whitespace.  Alternative forms inside one argument are
    separated by "/"; every slash in a text node starts a new form (the
    previous code only honoured the first slash of a text node, so
    "a / b / c" produced "a" and "b / c").

    Nested templates whose cleaned text ends with "." are treated as raw
    tags of the current form, other templates contribute their text to the
    form, and the "potencjalnie" template additionally adds the tags
    "potential" and "rare".
    """
    # noun table
    # https://pl.wiktionary.org/wiki/Szablon:odmiana-rzeczownik-polski
    forms = []
    for arg_name, arg_value in template_node.template_parameters.items():
        if not isinstance(arg_name, str):
            continue
        if arg_name.startswith("Forma"):
            raw_tags = ["depr."] if arg_name.endswith("depr") else ["ndepr."]
            raw_tags.extend(["M.", "W.", "lm"])
        else:
            raw_tags = arg_name.lower().split()
        if isinstance(arg_value, str):
            arg_value = [arg_value]
        if not isinstance(arg_value, list):
            continue

        form_nodes = []
        current_form_raw_tags = []
        current_form_tags = []
        parsed_arg = wxr.wtp.parse(wxr.wtp.node_to_wikitext(arg_value))
        for node in parsed_arg.children:
            if isinstance(node, str) and "/" in node:
                # every piece before a slash finishes a form; the piece
                # after the last slash begins the next one
                *finished_pieces, pending_piece = node.split("/")
                for piece in finished_pieces:
                    form_nodes.append(piece)
                    _flush_noun_form(
                        wxr,
                        forms,
                        sense_index,
                        raw_tags,
                        form_nodes,
                        current_form_raw_tags,
                        current_form_tags,
                    )
                form_nodes.append(pending_piece)
            elif isinstance(node, TemplateNode):
                node_text = clean_node(wxr, None, node)
                if node_text.endswith("."):
                    # abbreviation-like output is kept as a raw tag
                    current_form_raw_tags.append(node_text)
                else:
                    form_nodes.append(node_text)
                if node.template_name == "potencjalnie":
                    current_form_tags.extend(["potential", "rare"])
            else:
                form_nodes.append(node)
        if len(form_nodes) > 0:
            _flush_noun_form(
                wxr,
                forms,
                sense_index,
                raw_tags,
                form_nodes,
                current_form_raw_tags,
                current_form_tags,
            )
    return forms

129 

130 

def create_noun_form(
    form_text: str,
    sense_idx: str,
    raw_tags: list[str],
) -> Form:
    """Build a ``Form`` from the given text, sense index and raw tags.

    ``translate_raw_tags`` is applied to the new form before it is
    returned.
    """
    noun_form = Form(
        raw_tags=raw_tags,
        sense_index=sense_idx,
        form=form_text,
    )
    translate_raw_tags(noun_form)
    return noun_form

139 

140 

@dataclass
class TableHeader:
    """A table header label together with the index range it covers.

    The extractors in this module compare cell ranges against
    ``start``/``end`` as a half-open interval (``start`` inclusive,
    ``end`` exclusive), for either columns or rows.
    """

    # cleaned header label, later used as a raw tag
    text: str
    # first column/row index covered by the header
    start: int
    # index one past the last column/row covered by the header
    end: int

146 

147 

def extract_odmiana_przymiotnik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Expand the adjective template and extract forms from its tables.

    The template is converted back to wikitext, parsed with all templates
    expanded, and every HTML ``table`` found recursively is passed to
    ``extract_odmiana_przymiotnik_polski_table``.
    """
    # adj table
    # https://pl.wiktionary.org/wiki/Szablon:odmiana-przymiotnik-polski
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    return [
        table_form
        for html_table in expanded_root.find_html_recursively("table")
        for table_form in extract_odmiana_przymiotnik_polski_table(
            wxr, html_table, sense_index
        )
    ]

164 

165 

def extract_odmiana_przymiotnik_polski_table(
    wxr: WiktextractContext, table_tag: WikiNode, sense_index: str
) -> list[Form]:
    """Extract forms from one expanded adjective declension table.

    Header handling, per row:

    * a ``th`` containing bold text holds a comparison form: the bold
      node is the form and the nodes preceding it are its raw tag;
    * any other ``th`` is a column header covering ``colspan`` columns,
      except the "przypadek" label, which is ignored;
    * a row whose ``td`` cells all have class "forma" is a second header
      row, each cell being a one-column header;
    * otherwise a ``td`` of class "forma" is the row header and the
      remaining cells are forms.

    A form cell receives the row header plus every column header whose
    range overlaps the cell's columns as raw tags.  Cells equal to the
    page title are skipped, and so are empty cells (the previous code
    emitted ``Form(form="")`` for them, unlike the verb and Esperanto
    extractors in this module); both still advance the column index.
    Processing of a row stops at the first ``td`` that contains a nested
    table.
    """
    forms = []
    col_headers = []
    for tr_tag in table_tag.find_html("tr"):
        th_col_index = 0
        for th_tag in tr_tag.find_html("th"):
            if th_tag.contain_node(NodeKind.BOLD):
                # comparative forms in the second and third table header
                raw_tag_nodes = []
                for th_child in th_tag.children:
                    if (
                        isinstance(th_child, WikiNode)
                        and th_child.kind == NodeKind.BOLD
                    ):
                        raw_tag = clean_node(wxr, None, raw_tag_nodes)
                        form = Form(
                            form=clean_node(wxr, None, th_child),
                            sense_index=sense_index,
                        )
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                            translate_raw_tags(form)
                        # "nie stopniuje się" is a notice, not a form
                        if form.form not in [
                            "",
                            "nie stopniuje się",
                            wxr.wtp.title,
                        ]:
                            forms.append(form)
                    else:
                        raw_tag_nodes.append(th_child)
            else:
                th_text = clean_node(wxr, None, th_tag)
                col_span = int(th_tag.attrs.get("colspan", "1"))
                # the "przypadek" label sits above the row headers and
                # is not counted as a data column
                if th_text != "przypadek":
                    col_headers.append(
                        TableHeader(
                            th_text,
                            th_col_index,
                            th_col_index + col_span,
                        )
                    )
                    th_col_index += col_span

        # td tags
        th_col_index = 0
        td_col_index = 0
        row_header = ""
        all_header_row = all(
            td_tag.attrs.get("class", "") == "forma"
            for td_tag in tr_tag.find_html("td")
        )
        for td_tag in tr_tag.find_html("td"):
            if any(td_tag.find_html("table")):
                break
            td_text = clean_node(wxr, None, td_tag)
            if all_header_row:
                col_headers.append(
                    TableHeader(td_text, th_col_index, th_col_index + 1)
                )
                th_col_index += 1
            elif "forma" == td_tag.attrs.get("class", ""):
                row_header = td_text
            else:
                col_span = int(td_tag.attrs.get("colspan", "1"))
                if td_text == "" or td_text == wxr.wtp.title:
                    # nothing to record, but the cell still occupies
                    # its columns
                    td_col_index += col_span
                    continue
                form = Form(form=td_text, sense_index=sense_index)
                if row_header != "":
                    form.raw_tags.append(row_header)
                for col_header in col_headers:
                    # half-open interval overlap test
                    if (
                        col_header.start < td_col_index + col_span
                        and td_col_index < col_header.end
                    ):
                        form.raw_tags.append(col_header.text)
                td_col_index += col_span
                translate_raw_tags(form)
                forms.append(form)
    return forms

248 

249 

def extract_odmiana_czasownik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Expand the verb template and extract forms from its tables.

    The template is converted back to wikitext, parsed with all templates
    expanded, and each HTML ``table`` found recursively is passed to
    ``extract_odmiana_czasownik_polski_table``.  One column-header list
    is shared by all of those calls.
    """
    # verb table
    # https://pl.wiktionary.org/wiki/Szablon:odmiana-czasownik-polski
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)

    # filled in by the table extractor and carried over to later tables
    shared_col_headers = []
    verb_forms = []
    for html_table in expanded_root.find_html_recursively("table"):
        verb_forms += extract_odmiana_czasownik_polski_table(
            wxr, html_table, sense_index, shared_col_headers
        )
    return verb_forms

267 

268 

def extract_odmiana_czasownik_polski_table(
    wxr: WiktextractContext,
    table_tag: WikiNode,
    sense_index: str,
    col_headers: list[TableHeader],
) -> list[Form]:
    """Extract forms from one expanded verb conjugation table.

    The table is walked twice.  The first pass records headers: a ``th``
    in a row without ``td`` cells and without a ``rowspan`` attribute is
    appended to ``col_headers`` (mutated in place); every other ``th`` is
    a row header, one per link when the cell contains links.  The labels
    "forma" and "pozostałe formy" are ignored.

    The second pass reads the ``td`` cells.  A cell's text is split on
    newlines and commas; each non-empty piece that differs from the page
    title becomes a ``Form`` tagged with all column and row headers whose
    ranges overlap the cell.  Cells holding a ``span`` of class
    "potential-form" also get the tags "potential" and "rare".  A row is
    abandoned at the first ``td`` containing a nested table.
    """

    def covers(header: TableHeader, first: int, length: int) -> bool:
        # half-open interval overlap test
        return header.start < first + length and first < header.end

    ignored_labels = ("forma", "pozostałe formy")
    row_headers = []

    # pass 1: column and row headers
    for row_no, tr in enumerate(table_tag.find_html("tr")):
        row_has_data = any(tr.find_html("td"))
        header_col = 0
        for th in tr.find_html("th"):
            label = clean_node(wxr, None, th)
            if label in ignored_labels:
                continue
            if row_has_data or "rowspan" in th.attrs:
                height = int(th.attrs.get("rowspan", "1"))
                if th.contain_node(NodeKind.LINK):
                    for link in th.find_child(NodeKind.LINK):
                        link_text = clean_node(wxr, None, link)
                        row_headers.append(
                            TableHeader(link_text, row_no, row_no + height)
                        )
                else:
                    row_headers.append(
                        TableHeader(label, row_no, row_no + height)
                    )
            else:
                width = int(th.attrs.get("colspan", "1"))
                col_headers.append(
                    TableHeader(label, header_col, header_col + width)
                )
                header_col += width

    # pass 2: data cells
    results = []
    for row_no, tr in enumerate(table_tag.find_html("tr")):
        cell_col = 0
        for td in tr.find_html("td"):
            if any(td.find_html("table")):
                break
            cell_text = clean_node(wxr, None, td)
            width = int(td.attrs.get("colspan", "1"))
            height = int(td.attrs.get("rowspan", "1"))
            # "Szablon:potencjalnie" uses "{{int:potential-form-tooltip}}"
            # not implemented magic word
            potential = any(
                True
                for _ in td.find_html(
                    "span", attr_name="class", attr_value="potential-form"
                )
            )
            # identical for every form of this cell: column headers
            # first, then row headers
            cell_raw_tags = [
                header.text
                for header in col_headers
                if covers(header, cell_col, width)
            ]
            cell_raw_tags += [
                header.text
                for header in row_headers
                if covers(header, row_no, height)
            ]

            for text_line in cell_text.splitlines():
                for piece in text_line.split(","):
                    candidate = piece.strip()
                    if candidate == "" or candidate == wxr.wtp.title:
                        continue
                    form = Form(form=candidate, sense_index=sense_index)
                    form.raw_tags.extend(cell_raw_tags)
                    translate_raw_tags(form)
                    if potential:
                        form.tags.extend(["potential", "rare"])
                    results.append(form)

            cell_col += width

    return results

348 

349 

def odmiana_rzeczownik_esperanto(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Expand the Esperanto noun template and extract its forms.

    ``span`` elements of class "short-content" whose text is "blm" or
    "blp" add the tags "no-plural" / "no-singulative" to every form.
    In each table, rows without ``td`` cells supply column headers and
    the last non-empty ``th`` of a data row is its row header.  Every
    non-empty ``td`` that differs from the page title becomes a ``Form``
    tagged with its row header and the column header at the same
    position; each "potential-form" span inside the cell adds the tags
    "potential" and "rare".
    """
    # noun table
    # https://pl.wiktionary.org/wiki/Szablon:odmiana-rzeczownik-esperanto
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)

    abbreviation_tags = {"blm": "no-plural", "blp": "no-singulative"}
    shared_tags = []
    for short_span in expanded_root.find_html_recursively(
        "span", attr_name="class", attr_value="short-content"
    ):
        mapped = abbreviation_tags.get(clean_node(wxr, None, short_span))
        if mapped is not None:
            shared_tags.append(mapped)

    results = []
    column_labels = []
    for table in expanded_root.find_html_recursively("table"):
        for tr in table.find_html("tr"):
            header_only = not any(tr.find_html("td"))
            row_label = ""
            for th in tr.find_html("th"):
                label = clean_node(wxr, None, th)
                if label == "":
                    continue
                if header_only:
                    column_labels.append(label)
                else:
                    row_label = label

            for position, td in enumerate(tr.find_html("td")):
                cell_text = clean_node(wxr, None, td)
                # one "potential"/"rare" pair per matching span
                span_count = sum(
                    1
                    for _ in td.find_html_recursively(
                        "span", attr_name="class", attr_value="potential-form"
                    )
                )
                cell_tags = ["potential", "rare"] * span_count
                if cell_text == "" or cell_text == wxr.wtp.title:
                    continue
                form = Form(
                    form=cell_text,
                    sense_index=sense_index,
                    tags=shared_tags + cell_tags,
                )
                if row_label != "":
                    form.raw_tags.append(row_label)
                if position < len(column_labels):
                    form.raw_tags.append(column_labels[position])
                translate_raw_tags(form)
                results.append(form)

    return results