Coverage for src/wiktextract/extractor/pl/inflection.py: 94%

1import re

2from dataclasses import dataclass

4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode

6from ...page import clean_node

7from ...wxr_context import WiktextractContext

8from .models import Form, WordEntry

9from .tags import translate_raw_tags

def extract_inflection_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    lang_code: str,
    level_node: WikiNode,
) -> None:
    """Extract inflected forms from a section and attach them to entries.

    Templates found inside list items are passed to
    ``extract_inflection_template`` together with the most recent sense
    index seen in the surrounding text (e.g. ``"1.1"`` taken from
    ``"(1.1)"``).  When the section contains no list at all, every template
    below ``level_node`` is processed instead.

    The collected forms are appended to ``forms`` of every item in
    ``page_data`` whose ``lang_code`` equals ``lang_code``; a form that
    carries a sense index is only added when ``match_sense_index`` accepts
    it for that entry.
    """
    # Function-level import, as in the rest of this function's history;
    # presumably avoids a circular import with ``.page`` - TODO confirm.
    from .page import match_sense_index

    current_sense = ""
    collected = []
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in item.children:
            if isinstance(child, TemplateNode):
                collected.extend(
                    extract_inflection_template(wxr, child, current_sense)
                )
            elif isinstance(child, str):
                # Inside the character class ",-." is a range from "," to
                # ".", which covers exactly ",", "-" and ".".
                found = re.search(r"\([\d\s,-.]+\)", child)
                if found is not None:
                    # The sense index stays in effect for later templates,
                    # including those in following list items.
                    current_sense = found.group(0).strip("()")

    if not level_node.contain_node(NodeKind.LIST):
        # have to search recursively because of "preformatted" nodes
        for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
            collected.extend(
                extract_inflection_template(wxr, template, current_sense)
            )

    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for form in collected:
            if form.sense_index == "" or match_sense_index(
                form.sense_index, entry
            ):
                entry.forms.append(form)

def extract_inflection_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Route an inflection template to the extractor for its template name.

    Returns the forms produced by the matching extractor, or an empty list
    when the template name is not one of the handled inflection templates.
    """
    name = template_node.template_name
    # The Czech noun template is handled by the same extractor as the
    # Polish one.
    if name in ("odmiana-rzeczownik-polski", "odmiana-rzeczownik-czeski"):
        return extract_odmiana_rzeczownik_polski(
            wxr, template_node, sense_index
        )
    if name == "odmiana-przymiotnik-polski":
        return extract_odmiana_przymiotnik_polski(
            wxr, template_node, sense_index
        )
    if name == "odmiana-czasownik-polski":
        return extract_odmiana_czasownik_polski(wxr, template_node, sense_index)
    if name == "odmiana-rzeczownik-esperanto":
        return odmiana_rzeczownik_esperanto(wxr, template_node, sense_index)
    return []

def extract_odmiana_rzeczownik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract noun forms from the parameters of a noun declension template.

    Template documentation:
    https://pl.wiktionary.org/wiki/Szablon:odmiana-rzeczownik-polski

    Every named template parameter is read as one or more forms separated
    by "/".  The parameter name supplies the raw tags: names starting with
    "Forma" get "depr." or "ndepr." plus "M.", "W." and "lm"; any other name
    is lower-cased and split on whitespace.  Templates inside a value either
    add a raw tag (when their cleaned text ends with ".") or contribute text
    to the form; a "potencjalnie" template additionally tags the form as
    "potential" and "rare".

    Forms whose text is empty or equal to the page title are skipped.

    Args:
        wxr: extraction context; used for parsing and for the page title.
        template_node: the declension template node.
        sense_index: sense index stored on every created form.

    Returns:
        The created forms, in parameter order.
    """
    forms = []

    def add_form(nodes: list, form_raw_tags: list[str], form_tags: list[str]):
        # Build one Form from the nodes accumulated for the current
        # alternative and store it, unless it is empty or the page title.
        form_text = clean_node(wxr, None, nodes)
        if form_text == "" or form_text == wxr.wtp.title:
            return
        form = Form(
            form=form_text,
            sense_index=sense_index,
            raw_tags=form_raw_tags,
            # Copy: the caller clears and reuses its list afterwards, so the
            # form must not share it.
            tags=list(form_tags),
        )
        translate_raw_tags(form)
        forms.append(form)

    for arg_name, arg_value in template_node.template_parameters.items():
        if not isinstance(arg_name, str):
            # positional parameters carry no case/number information
            continue
        if arg_name.startswith("Forma"):
            raw_tags = ["depr."] if arg_name.endswith("depr") else ["ndepr."]
            raw_tags.extend(["M.", "W.", "lm"])
        else:
            raw_tags = arg_name.lower().split()

        # A value may be a plain string or a list of strings and nodes;
        # presumably it can also be a single node (e.g. a value made of one
        # template only) - wrap both so they are not silently dropped.
        if isinstance(arg_value, (str, WikiNode)):
            arg_value = [arg_value]
        if not isinstance(arg_value, list):
            continue

        form_nodes = []
        current_form_raw_tags = []
        current_form_tags = []
        parsed_arg = wxr.wtp.parse(wxr.wtp.node_to_wikitext(arg_value))
        for node in parsed_arg.children:
            if isinstance(node, str) and "/" in node:
                # Every "/" ends the current alternative, so a text node
                # such as "a / b / c" closes two forms and starts a third.
                *finished_parts, remainder = node.split("/")
                for part in finished_parts:
                    form_nodes.append(part)
                    add_form(
                        form_nodes,
                        raw_tags + current_form_raw_tags,
                        current_form_tags,
                    )
                    form_nodes.clear()
                    current_form_raw_tags.clear()
                    current_form_tags.clear()
                form_nodes.append(remainder)
            elif isinstance(node, TemplateNode):
                node_text = clean_node(wxr, None, node)
                if node_text.endswith("."):
                    # abbreviation-like text is kept as a raw tag
                    current_form_raw_tags.append(node_text)
                else:
                    form_nodes.append(node_text)
                if node.template_name == "potencjalnie":
                    current_form_tags.extend(["potential", "rare"])
            else:
                form_nodes.append(node)

        # the last (or only) alternative has no trailing "/"
        if len(form_nodes) > 0:
            add_form(
                form_nodes,
                raw_tags + current_form_raw_tags,
                current_form_tags,
            )
    return forms

129

130

def create_noun_form(
    form_text: str,
    sense_idx: str,
    raw_tags: list[str],
) -> Form:
    """Build a ``Form`` and run ``translate_raw_tags`` on it.

    Args:
        form_text: text of the inflected form.
        sense_idx: sense index stored on the form.
        raw_tags: raw tags stored on the form before translation.

    Returns:
        The new form after ``translate_raw_tags`` has processed it.
    """
    noun_form = Form(
        form=form_text,
        sense_index=sense_idx,
        raw_tags=raw_tags,
    )
    translate_raw_tags(noun_form)
    return noun_form

139

140

@dataclass
class TableHeader:
    """Text of a table header together with the index range it covers.

    The extractors in this module create instances as
    ``TableHeader(text, index, index + span)`` for column or row headers and
    match a cell against it with
    ``header.start < cell_index + cell_span and cell_index < header.end``.
    """

    # cleaned text of the header cell
    text: str
    # first column/row index covered by the header
    start: int
    # index just past the covered range (start + span)
    end: int

146

147

def extract_odmiana_przymiotnik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract adjective forms from an expanded adjective declension template.

    Template documentation:
    https://pl.wiktionary.org/wiki/Szablon:odmiana-przymiotnik-polski

    The template is converted back to wikitext, parsed with full expansion,
    and every HTML ``table`` found in the result (at any depth) is handed to
    ``extract_odmiana_przymiotnik_polski_table``.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    return [
        form
        for table in expanded.find_html_recursively("table")
        for form in extract_odmiana_przymiotnik_polski_table(
            wxr, table, sense_index
        )
    ]

164

165

def extract_odmiana_przymiotnik_polski_table(
    wxr: WiktextractContext, table_tag: WikiNode, sense_index: str
) -> list[Form]:
    """Extract forms from one HTML table of the adjective declension template.

    For every ``tr`` directly under ``table_tag``:

    * ``th`` cells containing bold text yield one form per bold node, with
      the text preceding it in the cell as raw tag;
    * other ``th`` cells, and ``td`` cells of rows in which every ``td`` has
      class "forma", are recorded as column headers;
    * in the remaining rows a ``td`` of class "forma" is the row header and
      every other ``td`` is a form, tagged with the row header and with all
      column headers whose column range overlaps the cell.

    Cells equal to the page title are skipped, and processing of a row stops
    at the first ``td`` that contains a nested table.
    """
    results = []
    column_headers = []
    for row in table_tag.find_html("tr"):
        header_col = 0
        for th in row.find_html("th"):
            if not th.contain_node(NodeKind.BOLD):
                header_text = clean_node(wxr, None, th)
                span = int(th.attrs.get("colspan", "1"))
                if header_text != "przypadek":
                    column_headers.append(
                        TableHeader(header_text, header_col, header_col + span)
                    )
                    # NOTE(review): the index advances only for recorded
                    # headers, so the skipped "przypadek" cell does not
                    # shift the columns - confirm against the table layout.
                    header_col += span
                continue

            # comparative forms in the second and third table header
            label_nodes = []
            for child in th.children:
                is_bold = (
                    isinstance(child, WikiNode) and child.kind == NodeKind.BOLD
                )
                if not is_bold:
                    # text before the bold node becomes its raw tag
                    label_nodes.append(child)
                    continue
                label = clean_node(wxr, None, label_nodes)
                form = Form(
                    form=clean_node(wxr, None, child),
                    sense_index=sense_index,
                )
                if label != "":
                    form.raw_tags.append(label)
                    translate_raw_tags(form)
                if form.form not in [
                    "",
                    "nie stopniuje się",
                    wxr.wtp.title,
                ]:
                    results.append(form)

        # td tags
        header_col = 0
        data_col = 0
        row_label = ""
        # True for rows whose data cells are all header cells (and, because
        # all() of nothing is True, also for rows without any td).
        only_header_cells = all(
            td.attrs.get("class", "") == "forma" for td in row.find_html("td")
        )
        for td in row.find_html("td"):
            if any(td.find_html("table")):
                # a cell wrapping a nested table ends this row
                break
            cell_text = clean_node(wxr, None, td)
            if only_header_cells:
                column_headers.append(
                    TableHeader(cell_text, header_col, header_col + 1)
                )
                header_col += 1
                continue
            if td.attrs.get("class", "") == "forma":
                row_label = cell_text
                continue

            span = int(td.attrs.get("colspan", "1"))
            cell_end = data_col + span
            if cell_text != wxr.wtp.title:
                form = Form(form=cell_text, sense_index=sense_index)
                if row_label != "":
                    form.raw_tags.append(row_label)
                for header in column_headers:
                    if header.start < cell_end and data_col < header.end:
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                results.append(form)
            # skipped cells still occupy their columns
            data_col = cell_end
    return results

248

249

def extract_odmiana_czasownik_polski(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract verb forms from an expanded verb conjugation template.

    Template documentation:
    https://pl.wiktionary.org/wiki/Szablon:odmiana-czasownik-polski

    The template is expanded and each HTML ``table`` in the result (at any
    depth) is processed by ``extract_odmiana_czasownik_polski_table``.  One
    column-header list is shared by all of these calls, so headers recorded
    for an earlier table are still applied to later ones.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    shared_col_headers = []
    collected = []
    for table in expanded.find_html_recursively("table"):
        collected += extract_odmiana_czasownik_polski_table(
            wxr, table, sense_index, shared_col_headers
        )
    return collected

267

268

def extract_odmiana_czasownik_polski_table(
    wxr: WiktextractContext,
    table_tag: WikiNode,
    sense_index: str,
    col_headers: list[TableHeader],
) -> list[Form]:
    """Extract forms from one HTML table of the verb conjugation template.

    The table is walked twice.  The first pass reads the ``th`` cells: a
    header in a row without ``td`` cells and without a ``rowspan`` attribute
    is appended to ``col_headers`` (the caller's list is modified); any
    other header becomes a row header, one per link node when the cell
    contains links.  Headers with the text "forma" or "pozostałe formy" are
    ignored.

    The second pass reads the ``td`` cells: the cell text is split on line
    breaks and commas, and each non-empty piece that differs from the page
    title becomes a form tagged with every overlapping column and row
    header.  Cells holding a ``span`` of class "potential-form" also get the
    tags "potential" and "rare".  A ``td`` containing a nested table ends
    its row.
    """

    def overlaps(header: TableHeader, position: int, span: int) -> bool:
        # True when [position, position + span) intersects the header range.
        return header.start < position + span and position < header.end

    results = []
    row_headers = []

    # pass 1: headers
    for row_no, row in enumerate(table_tag.find_html("tr")):
        row_has_data = any(row.find_html("td"))
        header_col = 0
        for th in row.find_html("th"):
            header_text = clean_node(wxr, None, th)
            if header_text in ("forma", "pozostałe formy"):
                continue
            if row_has_data or "rowspan" in th.attrs:
                # row header
                row_end = row_no + int(th.attrs.get("rowspan", "1"))
                if th.contain_node(NodeKind.LINK):
                    for link in th.find_child(NodeKind.LINK):
                        row_headers.append(
                            TableHeader(
                                clean_node(wxr, None, link), row_no, row_end
                            )
                        )
                else:
                    row_headers.append(
                        TableHeader(header_text, row_no, row_end)
                    )
            else:
                # column header
                span = int(th.attrs.get("colspan", "1"))
                col_headers.append(
                    TableHeader(header_text, header_col, header_col + span)
                )
                header_col += span

    # pass 2: data cells
    for row_no, row in enumerate(table_tag.find_html("tr")):
        data_col = 0
        for td in row.find_html("td"):
            if any(td.find_html("table")):
                break
            cell_text = clean_node(wxr, None, td)
            col_span = int(td.attrs.get("colspan", "1"))
            row_span = int(td.attrs.get("rowspan", "1"))
            # "Szablon:potencjalnie" uses "{{int:potential-form-tooltip}}",
            # a magic word that is not implemented, so the span's class is
            # checked instead.
            potential = any(
                True
                for _ in td.find_html(
                    "span", attr_name="class", attr_value="potential-form"
                )
            )

            pieces = (
                piece.strip()
                for line in cell_text.splitlines()
                for piece in line.split(",")
            )
            for form_text in pieces:
                if form_text == "" or form_text == wxr.wtp.title:
                    continue
                form = Form(form=form_text, sense_index=sense_index)
                for header in col_headers:
                    if overlaps(header, data_col, col_span):
                        form.raw_tags.append(header.text)
                for header in row_headers:
                    if overlaps(header, row_no, row_span):
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                if potential:
                    form.tags.extend(["potential", "rare"])
                results.append(form)

            data_col += col_span

    return results

348

349

def odmiana_rzeczownik_esperanto(
    wxr: WiktextractContext, template_node: TemplateNode, sense_index: str
) -> list[Form]:
    """Extract noun forms from an expanded Esperanto noun declension template.

    Template documentation:
    https://pl.wiktionary.org/wiki/Szablon:odmiana-rzeczownik-esperanto

    ``span`` elements of class "short-content" with the text "blm" or "blp"
    add the tag "no-plural" or "no-singulative" to every form.  In each
    table, rows without ``td`` cells supply the column headers; in other
    rows the last non-empty ``th`` is the row header and every ``td`` is a
    form, tagged with the row header and the column header at the same
    position.  Cells containing a ``span`` of class "potential-form" also
    get the tags "potential" and "rare".

    Each tag is added at most once per form, even when several marker spans
    are present.  Empty cells and cells equal to the page title are skipped.

    Args:
        wxr: extraction context; used for parsing and for the page title.
        template_node: the declension template node.
        sense_index: sense index stored on every created form.

    Returns:
        The created forms, in table order.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    forms = []
    col_headers = []
    tags = []
    marker_tags = {"blm": "no-plural", "blp": "no-singulative"}
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="short-content"
    ):
        tag = marker_tags.get(clean_node(wxr, None, span_tag))
        # a repeated marker must not repeat the tag on every form
        if tag is not None and tag not in tags:
            tags.append(tag)

    for table_tag in expanded_node.find_html_recursively("table"):
        for tr_tag in table_tag.find_html("tr"):
            is_header_row = not any(tr_tag.find_html("td"))
            row_header = ""
            for th_tag in tr_tag.find_html("th"):
                th_text = clean_node(wxr, None, th_tag)
                if th_text == "":
                    continue
                if is_header_row:
                    col_headers.append(th_text)
                else:
                    row_header = th_text

            for td_index, td_tag in enumerate(tr_tag.find_html("td")):
                form_text = clean_node(wxr, None, td_tag)
                if form_text == "" or form_text == wxr.wtp.title:
                    continue
                # One flag per cell: several "potential-form" spans in the
                # same cell must not duplicate the tags.
                is_potential_form = any(
                    True
                    for _ in td_tag.find_html_recursively(
                        "span", attr_name="class", attr_value="potential-form"
                    )
                )
                td_tags = ["potential", "rare"] if is_potential_form else []
                form = Form(
                    form=form_text,
                    sense_index=sense_index,
                    tags=tags + td_tags,
                )
                if row_header != "":
                    form.raw_tags.append(row_header)
                # data cells are matched to column headers by position
                if td_index < len(col_headers):
                    form.raw_tags.append(col_headers[td_index])
                translate_raw_tags(form)
                forms.append(form)

    return forms