Coverage for src/wiktextract/extractor/pl/inflection.py: 95%

233 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_inflection_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    lang_code: str,
    level_node: WikiNode,
) -> None:
    """Extract the forms of an inflection section and attach them to entries.

    Every inflection template found under ``level_node`` is converted to
    ``Form`` objects. A parenthesised sense index found in the plain text
    of a list item (for example ``(1.1)``) applies to the templates that
    follow it. The forms are then appended to each entry of ``page_data``
    whose ``lang_code`` equals ``lang_code``, provided the form has no sense
    index or ``match_sense_index`` accepts it for that entry.
    """
    # Function-scope import, kept exactly where the original code had it.
    from .page import match_sense_index

    current_sense = ""
    collected: list[Form] = []
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in item.children:
            if isinstance(child, TemplateNode):
                collected += extract_inflection_template(
                    wxr, child, current_sense
                )
                continue
            if not isinstance(child, str):
                continue
            found = re.search(r"\([\d\s,-.]+\)", child)
            if found is not None:
                current_sense = found.group(0).strip("()")

    if not level_node.contain_node(NodeKind.LIST):
        # Without a list the templates can be nested deeper (for example
        # inside a "preformatted" node), hence the recursive search.
        for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
            collected += extract_inflection_template(
                wxr, template, current_sense
            )

    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for form in collected:
            if form.sense_index != "" and not match_sense_index(
                form.sense_index, entry
            ):
                continue
            entry.forms.append(form)

44 

45 

def extract_inflection_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Route an inflection template to the extractor that handles it.

    Returns the forms produced by the matching extractor, or an empty list
    when the template name is not one of the supported inflection templates.
    """
    name = template_node.template_name
    if name in ("odmiana-rzeczownik-polski", "odmiana-rzeczownik-czeski"):
        return extract_odmiana_rzeczownik_polski(
            wxr, template_node, sense_index
        )
    if name == "odmiana-przymiotnik-polski":
        return extract_odmiana_przymiotnik_polski(
            wxr, template_node, sense_index
        )
    if name == "odmiana-czasownik-polski":
        return extract_odmiana_czasownik_polski(wxr, template_node, sense_index)
    if name == "odmiana-rzeczownik-esperanto":
        return odmiana_rzeczownik_esperanto(wxr, template_node, sense_index)
    # Unsupported template: nothing to extract.
    return []

66 

67 

def extract_odmiana_rzeczownik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract noun forms from the arguments of a noun inflection template.

    Template: https://pl.wiktionary.org/wiki/Szablon:odmiana-rzeczownik-polski

    Each named template argument is one table cell. The argument name gives
    the raw tags (lower-cased and split on whitespace), except for names
    starting with "Forma", which get "depr."/"ndepr." plus "M.", "W.", "lm".
    Alternative forms inside one argument are separated by "/"; every
    alternative becomes its own ``Form``. Forms that are empty or equal to
    the page title are skipped.

    Returns the list of extracted forms, all carrying ``sense_index``.
    """
    forms: list[Form] = []
    for arg_name, arg_value in template_node.template_parameters.items():
        if not isinstance(arg_name, str):
            # Only named arguments describe table cells.
            continue
        if arg_name.startswith("Forma"):
            raw_tags = ["depr."] if arg_name.endswith("depr") else ["ndepr."]
            raw_tags.extend(["M.", "W.", "lm"])
        else:
            raw_tags = arg_name.lower().split()
        if isinstance(arg_value, str):
            arg_value = [arg_value]
        if not isinstance(arg_value, list):
            continue

        form_nodes = []
        variant_raw_tags: list[str] = []
        variant_tags: list[str] = []
        parsed_arg = wxr.wtp.parse(wxr.wtp.node_to_wikitext(arg_value))
        for node in parsed_arg.children:
            if isinstance(node, str) and "/" in node:
                # Split on every slash, not just the first one, so that
                # plain text such as "a / b / c" yields three forms.
                *finished, remainder = node.split("/")
                for piece in finished:
                    form_nodes.append(piece)
                    _append_noun_form(
                        wxr,
                        forms,
                        form_nodes,
                        sense_index,
                        raw_tags + variant_raw_tags,
                        variant_tags,
                    )
                    # Tags collected so far belong to the finished variant.
                    form_nodes.clear()
                    variant_raw_tags.clear()
                    variant_tags.clear()
                form_nodes.append(remainder)
            elif isinstance(node, TemplateNode):
                node_text = clean_node(wxr, None, node)
                if node_text.endswith("."):
                    # Text ending with a dot is treated as a tag
                    # abbreviation rather than as part of the form.
                    variant_raw_tags.append(node_text)
                else:
                    form_nodes.append(node_text)
                if node.template_name == "potencjalnie":
                    variant_tags.extend(["potential", "rare"])
            else:
                form_nodes.append(node)
        if form_nodes:
            _append_noun_form(
                wxr,
                forms,
                form_nodes,
                sense_index,
                raw_tags + variant_raw_tags,
                variant_tags,
            )
    return forms


def _append_noun_form(
    wxr: WiktextractContext,
    forms: list[Form],
    form_nodes: list,
    sense_index: str,
    raw_tags: list[str],
    tags: list[str],
) -> None:
    """Append a form built from ``form_nodes`` unless empty or the title."""
    form_text = clean_node(wxr, None, form_nodes)
    if form_text == "" or form_text == wxr.wtp.title:
        return
    form = Form(
        form=form_text,
        sense_index=sense_index,
        # Copies guard against the caller clearing its lists afterwards;
        # whether Form copies its inputs is not visible from this module.
        raw_tags=list(raw_tags),
        tags=list(tags),
    )
    translate_raw_tags(form)
    forms.append(form)

129 

130 

def create_noun_form(
    form_text: str,
    sense_idx: str,
    raw_tags: list[str],
) -> Form:
    """Create a ``Form`` from its text, sense index and raw tags.

    ``translate_raw_tags`` is applied to the new form before it is returned.
    """
    noun_form = Form(
        form=form_text,
        sense_index=sense_idx,
        raw_tags=raw_tags,
    )
    translate_raw_tags(noun_form)
    return noun_form

139 

140 

@dataclass
class TableHeader:
    """A table header label together with the index span it covers.

    The extractors in this module compare ``start`` and ``end`` against
    column or row indexes as a half-open interval: a cell spanning
    ``[i, j)`` matches when ``start < j`` and ``i < end``.
    """

    # Cleaned text of the header cell.
    text: str
    # First column/row index covered by the header.
    start: int
    # Index just past the last column/row covered by the header.
    end: int

146 

147 

def extract_odmiana_przymiotnik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract adjective forms from an expanded adjective template.

    Template: https://pl.wiktionary.org/wiki/Szablon:odmiana-przymiotnik-polski

    The template is expanded and re-parsed, and the forms of every HTML
    table found in the result are gathered in document order.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    return [
        form
        for table in expanded.find_html_recursively("table")
        for form in extract_odmiana_przymiotnik_polski_table(
            wxr, table, sense_index
        )
    ]

164 

165 

def extract_odmiana_przymiotnik_polski_table(
    wxr: WiktextractContext, table_tag: WikiNode, sense_index: str
) -> list[Form]:
    """Extract the forms of one HTML table of the adjective template.

    Column headers come from ``<th>`` cells and from rows whose ``<td>``
    cells all have the class "forma". In other rows a ``<td>`` with class
    "forma" is the row header and the remaining cells hold forms. Each form
    receives the row header and every column header whose span overlaps the
    cell. Header cells containing bold text are handled separately: each
    bold node is a form and the other nodes seen before it in the same cell
    are its raw tag.

    Cells that are empty or equal to the page title produce no form but
    still advance the column index.
    """
    forms: list[Form] = []
    col_headers: list[TableHeader] = []
    for tr_tag in table_tag.find_html("tr"):
        th_col_index = 0
        for th_tag in tr_tag.find_html("th"):
            if th_tag.contain_node(NodeKind.BOLD):
                # comparative forms in the second and third table header
                raw_tag_nodes = []
                for th_child in th_tag.children:
                    if (
                        isinstance(th_child, WikiNode)
                        and th_child.kind == NodeKind.BOLD
                    ):
                        form = Form(
                            form=clean_node(wxr, None, th_child),
                            raw_tags=[clean_node(wxr, None, raw_tag_nodes)],
                            sense_index=sense_index,
                        )
                        translate_raw_tags(form)
                        forms.append(form)
                    else:
                        raw_tag_nodes.append(th_child)
            else:
                th_text = clean_node(wxr, None, th_tag)
                col_span = int(th_tag.attrs.get("colspan", "1"))
                if th_text != "przypadek":
                    # "przypadek" presumably labels the row-header column;
                    # row-header cells are not counted in the data column
                    # index below, so it must not shift the header spans.
                    col_headers.append(
                        TableHeader(
                            th_text,
                            th_col_index,
                            th_col_index + col_span,
                        )
                    )
                    th_col_index += col_span

        # td tags
        th_col_index = 0
        td_col_index = 0
        row_header = ""
        # A row made only of "forma" cells is a second row of column headers.
        all_header_row = all(
            td_tag.attrs.get("class", "") == "forma"
            for td_tag in tr_tag.find_html("td")
        )
        for td_tag in tr_tag.find_html("td"):
            if any(td_tag.find_html("table")):
                # The cell wraps a nested table; stop processing this row.
                break
            td_text = clean_node(wxr, None, td_tag)
            if all_header_row:
                col_headers.append(
                    TableHeader(td_text, th_col_index, th_col_index + 1)
                )
                th_col_index += 1
            elif td_tag.attrs.get("class", "") == "forma":
                row_header = td_text
            else:
                col_span = int(td_tag.attrs.get("colspan", "1"))
                if td_text == "" or td_text == wxr.wtp.title:
                    # Empty cells carry no form, matching the verb and
                    # Esperanto extractors; the column still counts.
                    td_col_index += col_span
                    continue
                form = Form(form=td_text, sense_index=sense_index)
                if row_header != "":
                    form.raw_tags.append(row_header)
                for col_header in col_headers:
                    if (
                        col_header.start < td_col_index + col_span
                        and td_col_index < col_header.end
                    ):
                        form.raw_tags.append(col_header.text)
                td_col_index += col_span
                translate_raw_tags(form)
                forms.append(form)
    return forms

241 

242 

def extract_odmiana_czasownik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract verb forms from an expanded verb inflection template.

    Template: https://pl.wiktionary.org/wiki/Szablon:odmiana-czasownik-polski

    The template is expanded and re-parsed; every HTML table in the result
    is processed in order. One column-header list is shared by all tables,
    so headers collected from an earlier table stay visible to later ones.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    shared_col_headers: list[TableHeader] = []
    verb_forms: list[Form] = []
    for table in expanded.find_html_recursively("table"):
        verb_forms += extract_odmiana_czasownik_polski_table(
            wxr, table, sense_index, shared_col_headers
        )
    return verb_forms

260 

261 

def extract_odmiana_czasownik_polski_table(
    wxr: WiktextractContext,
    table_tag: WikiNode,
    sense_index: str,
    col_headers: list[TableHeader],
) -> list[Form]:
    """Extract the forms of one HTML table of the verb template.

    The table is walked twice. The first pass records headers: a ``<th>``
    in a row without ``<td>`` cells and without a "rowspan" attribute is
    appended to ``col_headers`` (mutated in place); any other ``<th>``
    becomes a row header, one per link when the cell contains links. The
    second pass splits every data cell into lines and comma-separated
    pieces, and tags each piece with the overlapping column headers
    followed by the overlapping row headers.
    """
    forms: list[Form] = []
    row_headers: list[TableHeader] = []

    # Pass 1: collect column and row headers.
    for row_no, row in enumerate(table_tag.find_html("tr")):
        row_has_cells = any(cell for cell in row.find_html("td"))
        header_col = 0
        for th_tag in row.find_html("th"):
            label = clean_node(wxr, None, th_tag)
            if label in ("forma", "pozostałe formy"):
                continue
            if not row_has_cells and "rowspan" not in th_tag.attrs:
                width = int(th_tag.attrs.get("colspan", "1"))
                col_headers.append(
                    TableHeader(label, header_col, header_col + width)
                )
                header_col += width
                continue
            row_end = row_no + int(th_tag.attrs.get("rowspan", "1"))
            if th_tag.contain_node(NodeKind.LINK):
                row_headers.extend(
                    TableHeader(clean_node(wxr, None, link), row_no, row_end)
                    for link in th_tag.find_child(NodeKind.LINK)
                )
            else:
                row_headers.append(TableHeader(label, row_no, row_end))

    # Pass 2: turn data cells into forms.
    for row_no, row in enumerate(table_tag.find_html("tr")):
        cell_col = 0
        for td_tag in row.find_html("td"):
            if any(nested for nested in td_tag.find_html("table")):
                break
            cell_text = clean_node(wxr, None, td_tag)
            width = int(td_tag.attrs.get("colspan", "1"))
            height = int(td_tag.attrs.get("rowspan", "1"))
            # "Szablon:potencjalnie" uses "{{int:potential-form-tooltip}}",
            # a magic word that is not implemented, so the marker span is
            # looked up directly.
            is_potential_form = any(
                True
                for _ in td_tag.find_html(
                    "span", attr_name="class", attr_value="potential-form"
                )
            )
            col_end = cell_col + width
            row_end = row_no + height
            pieces = (
                piece.strip()
                for line in cell_text.splitlines()
                for piece in line.split(",")
            )
            for form_text in pieces:
                if form_text == "" or form_text == wxr.wtp.title:
                    continue
                form = Form(form=form_text, sense_index=sense_index)
                form.raw_tags.extend(
                    header.text
                    for header in col_headers
                    if header.start < col_end and cell_col < header.end
                )
                form.raw_tags.extend(
                    header.text
                    for header in row_headers
                    if header.start < row_end and row_no < header.end
                )
                translate_raw_tags(form)
                if is_potential_form:
                    form.tags.extend(["potential", "rare"])
                forms.append(form)

            cell_col += width

    return forms

341 

342 

def odmiana_rzeczownik_esperanto(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract noun forms from an expanded Esperanto noun template.

    Template: https://pl.wiktionary.org/wiki/Szablon:odmiana-rzeczownik-esperanto

    "short-content" spans with the text "blm" or "blp" add the tags
    "no-plural" or "no-singulative" to every extracted form. In each table,
    rows without ``<td>`` cells supply column headers and other rows supply
    a row header plus form cells. Cells that are empty or equal to the page
    title are skipped. A cell containing a "potential-form" span gets the
    tags "potential" and "rare" exactly once, however many spans it holds.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    forms: list[Form] = []
    col_headers: list[str] = []
    tags: list[str] = []
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="short-content"
    ):
        span_text = clean_node(wxr, None, span_tag)
        if span_text == "blm":
            new_tag = "no-plural"
        elif span_text == "blp":
            new_tag = "no-singulative"
        else:
            continue
        # A repeated marker must not duplicate the tag.
        if new_tag not in tags:
            tags.append(new_tag)

    for table_tag in expanded_node.find_html_recursively("table"):
        for tr_tag in table_tag.find_html("tr"):
            is_header_row = not any(t for t in tr_tag.find_html("td"))
            row_header = ""
            for th_tag in tr_tag.find_html("th"):
                th_text = clean_node(wxr, None, th_tag)
                if th_text == "":
                    continue
                if is_header_row:
                    col_headers.append(th_text)
                else:
                    row_header = th_text
            for td_index, td_tag in enumerate(tr_tag.find_html("td")):
                form_text = clean_node(wxr, None, td_tag)
                if form_text == "" or form_text == wxr.wtp.title:
                    continue
                # Tag the form once even if several marker spans exist.
                is_potential_form = any(
                    True
                    for _ in td_tag.find_html_recursively(
                        "span", attr_name="class", attr_value="potential-form"
                    )
                )
                td_tags = ["potential", "rare"] if is_potential_form else []
                form = Form(
                    form=form_text,
                    sense_index=sense_index,
                    tags=tags + td_tags,
                )
                if row_header != "":
                    form.raw_tags.append(row_header)
                if td_index < len(col_headers):
                    form.raw_tags.append(col_headers[td_index])
                translate_raw_tags(form)
                forms.append(form)

    return forms