Coverage for src/wiktextract/extractor/el/table.py: 83%

1import re

2from typing import TypeAlias

3from unicodedata import name as unicode_name

5from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode

7from wiktextract.clean import clean_value

8from wiktextract.extractor.el.tags import translate_raw_tags

9from wiktextract.wxr_context import WiktextractContext

11from .models import Form, FormSource, WordEntry

12from .parse_utils import GREEK_LANGCODES, remove_duplicate_forms

# Shorthand for this file. Could be an import, but it's so simple...
# A parse-tree child is either plain text or a nested WikiNode.
Node: TypeAlias = str | WikiNode

# GREEK TABLE HEURISTICS:
# If the table belongs to a Greek-language entry, a cell that is a header
# cell or is in italics is a header.
# If the table does NOT belong to a Greek entry, a cell that has Greek text
# is a header.

# node_fns are different from template_fns. template_fns are functions that
# are used to handle how to expand (and otherwise process) templates, while
# node functions are used when turning any parsed "abstract" nodes into strings.
def cell_node_fn(
    node: WikiNode,
) -> list[Node] | None:
    """Handle nodes in the parse tree specially.

    Used as the ``node_handler_fn`` when a table cell is converted to text.

    * Italic and bold nodes are replaced by their children wrapped in the
      textual markers ``__I__``/``__/I__`` and ``__B__``/``__/B__``, which
      ``BOLD_RE`` later splits on.
    * Table cells and table header cells are replaced by their children.
    * For every other node ``None`` is returned, leaving the node to the
      caller's default handling.
    """
    assert isinstance(node, WikiNode)
    kind = node.kind
    if kind == NodeKind.ITALIC:
        return ["__I__", *node.children, "__/I__"]
    if kind == NodeKind.BOLD:
        return ["__B__", *node.children, "__/B__"]
    # In case someone puts tables inside tables...
    if kind in {NodeKind.TABLE_CELL, NodeKind.TABLE_HEADER_CELL}:
        return node.children
    return None

# Splits text on the bold/italic markers; the capturing group keeps the
# markers themselves in the output of `split()`.
BOLD_RE = re.compile(r"(__/?[BI]__)")
# Digits at the very end of a string (usually footnote references).
TRAILING_NUMBER_RE = re.compile(r"\d+$")

# Each article is listed once; several of them serve more than one
# case/gender/number slot, which is why the set is smaller than a full
# declension table.
ARTICLES: set[str] = {
    "ο",
    "η",
    "το",
    "την",
    "της",
    "τον",
    "τη",
    "οι",
    "τα",
    "των",
    "τους",
    "του",
    "τις",
}
EXTENDED_ARTICLES = ARTICLES | {
    "τη(ν)",
    "ο/η",
    "του/της",
    "τον/τη",
    "τον/τη(ν)",
    "τον/την",
    "τους/τις",
}
"""Articles to trim from inflection tables / headwords."""
# NOTE(review): "τη" is also a member of ARTICLES; code that tests ARTICLES
# first will never reach the entry below.
UNEXPECTED_ARTICLES = {
    "αι",
    "ένα",
    "ένας",
    "στα",
    "στη",
    "στην",
    "στης",
    "στις",
    "στο",
    "στον",
    "στου",
    "στους",
    "στων",
    "τ'",
    "ταις",
    "τας",
    "τες",
    "τη",
    "τοις",
    "τω",
}
"""Includes contractions, Ancient Greek articles etc."""

def process_inflection_section(
    wxr: WiktextractContext,
    data: WordEntry,
    snode: WikiNode,
    *,
    source: FormSource = "",
    top_template_name: str | None = None,
) -> None:
    """Find every table under ``snode`` and parse it into ``data.forms``.

    The section is walked once with a node handler that expands templates and
    records each table node together with the name of the top-level template
    that produced it (if any). Each recorded table is then passed to
    ``parse_table`` and finally ``data.forms`` is de-duplicated with
    ``remove_duplicate_forms``.

    Args:
        wxr: Extraction context; gives access to the wikitext processor.
        data: Word entry whose ``forms`` list is extended and then replaced
            by its de-duplicated version.
        snode: Root node of the section to scan.
        source: Value stored in the ``source`` field of generated forms.
        top_template_name: Template name associated with tables found
            outside any template. It is overwritten while a top-level
            template is being expanded and reset to ``None`` afterwards.
    """
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in table_node_handler_fn
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not templates inside templates.
    template_depth = 0

    def table_node_handler_fn(
        node: WikiNode,
    ) -> list[str] | str | None:
        """Expand templates recursively and collect the table nodes found.

        Templates are expanded, re-parsed and converted to text with this
        same handler so that tables generated by templates are seen too.
        Table nodes are stored in ``table_nodes`` and replaced by an empty
        string; all other nodes get the default handling (``None``).
        """
        assert isinstance(node, WikiNode)
        nonlocal template_depth
        nonlocal top_template_name

        if isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # templates are handled with this handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=table_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret

        if node.kind == NodeKind.TABLE:
            # Remember which top-level template (if any) the table came from.
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # Only the handler's side effect (filling table_nodes) matters here; the
    # generated HTML is discarded.
    wxr.wtp.node_to_html(snode, node_handler_fn=table_node_handler_fn)

    for template_name, table_node in table_nodes:
        parse_table(
            wxr,
            table_node,
            data,
            data.lang_code in GREEK_LANGCODES,
            template_name=template_name or "",
            source=source,
        )

    data.forms = remove_duplicate_forms(wxr, data.forms)

def _get_cell_spans(wxr: WiktextractContext, cell: WikiNode) -> tuple[int, int]:
    """Return the sanitized ``(rowspan, colspan)`` of a table cell.

    Unparsable values make both spans fall back to 1. A span above 30 is
    reported through ``wxr.wtp.error`` and reset to 1.
    """
    try:
        rowspan = int(cell.attrs.get("rowspan", "1"))
        colspan = int(cell.attrs.get("colspan", "1"))
    except ValueError:
        rowspan = 1
        colspan = 1

    if colspan > 30:
        wxr.wtp.error(
            f"Colspan {colspan} over 30, set to 1",
            sortid="table/128/20250207",
        )
        colspan = 1
    if rowspan > 30:
        wxr.wtp.error(
            f"Rowspan {rowspan} over 30, set to 1",
            sortid="table/134/20250207b",
        )
        rowspan = 1
    return rowspan, colspan


def _build_table_grid(
    wxr: WiktextractContext, tnode: WikiNode
) -> tuple[dict[int, dict[int, WikiNode]], dict[int, tuple[int, int]]]:
    """Lay the cells of ``tnode`` out on a ``row -> column -> cell`` grid.

    A cell is referenced from every grid coordinate it occupies, so spanned
    positions can be detected and skipped. The second return value maps
    ``id(cell)`` to the sanitized ``(rowspan, colspan)`` used for the layout.
    """
    is_html_table = isinstance(tnode, HTMLNode)
    table_grid: dict[int, dict[int, WikiNode]] = {}
    cell_spans: dict[int, tuple[int, int]] = {}

    rows = (
        tnode.find_html_recursively("tr")
        if is_html_table
        else tnode.find_child_recursively(NodeKind.TABLE_ROW)
    )
    for r, row in enumerate(rows):
        c = 0
        table_grid.setdefault(r, {})

        cells = (
            row.find_html(["th", "td"])
            if is_html_table
            else row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL,
            )
        )
        for cell in cells:
            # Skip grid positions already taken by a cell spanning down from
            # an earlier row.
            while c in table_grid[r]:
                c += 1

            rowspan, colspan = _get_cell_spans(wxr, cell)
            cell_spans[id(cell)] = (rowspan, colspan)

            for rr in range(r, r + rowspan):
                grid_row = table_grid.setdefault(rr, {})
                for cc in range(c, c + colspan):
                    grid_row[cc] = cell

    return table_grid, cell_spans


def parse_table(
    wxr: WiktextractContext,
    tnode: WikiNode,
    data: WordEntry,
    is_greek_entry: bool = False,  # Whether the entry is for a Greek word
    template_name: str = "",
    *,
    source: FormSource = "",
) -> None:
    """Parse inflection table. Generates 'form' data; 'foos' is a form of 'foo'
    with the tags ['plural'].

    Args:
        wxr: Extraction context.
        tnode: A wikitext table node or an HTML ``<table>`` node.
        data: Word entry whose ``forms`` list is extended.
        is_greek_entry: Whether the entry is for a Greek word; selects the
            header heuristics used by ``is_header``.
        template_name: Name of the template that generated the table. When
            non-empty and at least one form was found, an extra form tagged
            ``inflection-template`` is added.
        source: Value stored in the ``source`` field of generated forms.

    A table without any non-empty row produces no forms.
    """
    assert (isinstance(tnode, WikiNode) and tnode.kind == NodeKind.TABLE) or (
        isinstance(tnode, HTMLNode) and tnode.tag == "table"
    )

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    if wxr.config.expand_tables:
        # Explicit encoding: the dump contains Greek text and must not depend
        # on the locale's default encoding.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tnode)
            f.write(f"{text}\n")

    Row: TypeAlias = int
    Column: TypeAlias = int

    # We complete the table using nested dicts (instead of arrays for
    # convenience) such that when we come across a node, we push that node's
    # reference to each coordinate point in the table grid it occupies. Each
    # grid point can then be checked for if it's been handled already and
    # skipped if needed.
    table_grid: dict[Row, dict[Column, WikiNode]]
    table_grid, cell_spans = _build_table_grid(wxr, tnode)

    if not table_grid:
        # No rows at all: nothing to extract.
        return

    if not table_grid[len(table_grid) - 1]:
        # Last row is empty; traverse backwards to skip empty rows at end
        last_item = next(
            (i for i, rowd in reversed(table_grid.items()) if rowd), None
        )
        if last_item is None:
            # Every row is empty.
            return
        table_grid = {
            i: rowd for i, rowd in table_grid.items() if i <= last_item
        }

    first_column_is_headers = True

    if len(table_grid[0]) == 1:
        # Table is one column in width, no headers on rows
        first_column_is_headers = False

    if len(table_grid) == 2:
        # The table has exactly two rows.
        # NOTE(review): the comment here used to read "only one or two
        # rows", but only the two-row case is tested; confirm whether
        # `<= 2` was intended.
        first_column_is_headers = False

    # Headers are saved in two dict that has their keys made out of tuples
    # made of their "bookends": so {(1,1), "foo"} for a header that is made
    # up of the first cell only of a row in the column_hdrs dict.
    # If we come across a header that has those exact same bookends, only
    # then do we replace the previous tags with it; if you have overlapping
    # 'widths', leave them so that we inherit different 'levels' of headers.
    Spread = tuple[int, int]
    SpreadDict = dict[Spread, str]
    # The column and row headers are saved into big dicts: column_hdrs is a dict
    # whose key is what row or column we are in. The values of that table grid
    # square is a dict with the bookends (`Spread`) and the tags associated with
    # those bookends
    column_hdrs_all: dict[Column, SpreadDict] = {}
    row_hdrs_all: dict[Row, dict[Column, SpreadDict]] = {}

    forms: list[Form] = []
    processed: set[WikiNode] = set()
    # Some tables have cells with stuff like `του` we want to add to the
    # next cell
    prefix: str | None = None

    first_cells_are_bold = False
    found_unformatted_text = False

    for r, row_d in table_grid.items():
        for c, cell in row_d.items():
            # A spanning cell occupies several grid points; handle it once.
            if cell in processed:
                continue
            processed.add(cell)

            # Reuse the spans the grid was built with so that header spreads
            # always agree with the grid layout (including the >30 cap).
            rowspan, colspan = cell_spans[id(cell)]

            spans = process_cell_text(wxr, cell)
            if not spans:
                continue

            if r == 0 and spans[0][0]:  # starts_bold
                first_cells_are_bold = True

            text = clean_value(wxr, " ".join(span[3] for span in spans))

            this_is_header, unformatted_text = is_header(
                wxr,
                cell,
                spans,
                is_greek_entry,
                found_unformatted_text,
                first_cells_are_bold,
            )

            if unformatted_text is True:
                found_unformatted_text = True

            if this_is_header or (c == 0 and first_column_is_headers is True):
                # Because Greek wiktionary has its own written script to rely
                # in heuristics, we can use that. It also seems that for
                # tables in Greek-language entries even if the table doesn't
                # use proper header cells, you can trust bolding and italics.

                # Currently we don't care which "direction" the header points:
                # we add the tag to both column headers and row headers, and
                # rely on that all headers are on only rows or columns that
                # don't have data cells; ie. headers and data aren't mixed.

                # Each row and each column gets its own header data.
                # The Spread key is used to keep track which headers should
                # "overlap": if the spread is different, that should always
                # mean that one is contained within another and thus they're
                # not complementary headers, but one "bigger" category and
                # one "specific" category. If the Spread is identical, then
                # that's obviously two complementary headers, and the later one
                # overwrites the other.
                row_spread = (r, r + rowspan)
                for rr in range(r, r + rowspan):
                    row_hdrs_all.setdefault(rr, {}).setdefault(c, {})[
                        row_spread
                    ] = text

                column_spread = (c, c + colspan)
                for cc in range(c, c + colspan):
                    column_hdrs_all.setdefault(cc, {})[column_spread] = text

                prefix = None

            elif text in ARTICLES:
                # A bare article cell is prepended to the next data cell.
                prefix = text
            else:
                # cell is data
                if text in UNEXPECTED_ARTICLES:
                    wxr.wtp.debug(
                        f"Found '{text}' in table '{wxr.wtp.title}'",
                        sortid="table/335",
                    )
                tags: set[str] = set()
                # Row headers: only those in columns to the left of this cell
                # whose spread covers the rows this cell occupies.
                for cc, vd in row_hdrs_all.get(r, {}).items():
                    if c <= cc:
                        continue
                    for (start, end), tag in vd.items():
                        if start > r or end < r + rowspan:
                            continue
                        tags.add(tag)
                # Column headers whose spread covers this cell's columns.
                for (start, end), tag in column_hdrs_all.get(c, {}).items():
                    if start > c or end < c + colspan:
                        continue
                    tags.add(tag)

                # One cell may hold several forms, separated by "&" or by
                # line breaks.
                texts = [text]
                if "&" in text:
                    texts = [t.strip() for t in text.split("&")]
                texts = [line for t in texts for line in t.splitlines()]
                if prefix is not None:
                    texts = [f"{prefix} {t}" for t in texts]
                    prefix = None
                if len(tags) > 0:
                    forms.extend(
                        Form(
                            form=form_text,
                            raw_tags=sorted(tags),
                            source=source,
                        )
                        for form_text in texts
                    )
                else:
                    # If a cell has no tags in a table, it's probably a note
                    # or something.
                    wxr.wtp.warning(
                        f"Cell without any tags in table: {text}",
                        sortid="table/300/20250217",
                    )

    # If there is no template name (https://el.wiktionary.org/wiki/κρόκος)
    # we are adding junk anyway. This prevents a Form with empty form, which
    # is treated as an (non critical) error by src/wiktextract/wiktionary.py
    #
    # (I think the κρόκος issue is due to not stopping parsing at headings,
    # since the two intermingled templates are in different headings...)
    if forms and template_name:
        data.forms.append(
            Form(
                form=template_name,
                tags=["inflection-template"],
                source=source,
            )
        )

    new_forms = postprocess_table_forms(forms, data.word)
    data.forms.extend(new_forms)

def remove_article_forms(forms: list[Form], word: str) -> list[Form]:
    """Return a form list without article forms.

    Articles can appear in two ways:
    * As a separate form:
      Ex. https://el.wiktionary.org/wiki/λίθος
    * As part of a form, inside form.form
      Ex. most tables

    Used in both headword and table forms. Note that for headword forms, where
    there is usually no grammatic information, we could also use these articles
    to populate tags - but since most of the time we remove articles in tables,
    it was deemed not worth.

    The ``Form`` objects are modified in place when a leading article is
    stripped. When ``word`` is itself an article, the input list is returned
    unchanged (the same list object, not a copy).
    """
    # Do not remove article forms for the article pages themselves...
    if word in ARTICLES:
        return forms

    new_forms: list[Form] = []
    for form in forms:
        # The whole form is just an article: drop it.
        if form.form in EXTENDED_ARTICLES:
            continue
        # Leading article followed by the actual form: keep only the form.
        parts = form.form.split()
        if len(parts) > 1 and parts[0] in EXTENDED_ARTICLES:
            form.form = " ".join(parts[1:])
        if not form.form:
            continue
        new_forms.append(form)
    return new_forms

def postprocess_table_forms(forms: list[Form], word: str) -> list[Form]:
    """Postprocess table forms.

    * Translate tags
    * Remove articles (requires original word)
    * Convert some parens to rare tag
    * Remove trailing numbers and stars (usu. notes)
    * Form expansion
    * Drop forms that end up empty

    About form expansion, there are two types:
    * Separators: "/", "-"
    * Strings inside parens

    The purpose being to go:
      FROM "θα ζητάν(ε) - ζητούν(ε)"
      TO   ["θα ζητάν", "θα ζητάνε", "θα ζητούν", "θα ζητούνε"]

    The ``Form`` objects in ``forms`` are modified in place; expanded forms
    are deep copies of the form they were derived from.

    References:
    * https://el.wiktionary.org/wiki/τρώω
    * https://el.wiktionary.org/wiki/ζητάω < this page is cursed anyway
      https://el.wiktionary.org/wiki/αγαπάω < use this instead
    """
    for form in forms:
        translate_raw_tags(form)

    clean_forms = remove_article_forms(forms, word)

    for form in clean_forms:
        # Parens > rare inflection (cf. μπόι)
        if form.form.startswith("(") and form.form.endswith(")"):
            form.form = form.form[1:-1]
            form.tags.append("rare")

        # Remove trailing numbers (usu. notes)
        # https://el.wiktionary.org/wiki/Καπιτόπουλος
        form.form = TRAILING_NUMBER_RE.sub("", form.form)
        # https://el.wiktionary.org/wiki/επιζών
        form.form = form.form.rstrip("*")

    # Separators
    separators = ("/", "-")
    verb_particles = ("θα", "να")
    separated_forms: list[Form] = []
    for form in clean_forms:
        # Assume only one type of separator present atm
        sep = next((sep for sep in separators if sep in form.form), None)
        if sep is None:
            separated_forms.append(form)
            continue

        # Ignore separator if the original word contained it
        # Ex. "-ισμός", "η-τάξη" etc.
        if sep in word:
            separated_forms.append(form)
            continue

        # Extract the leading verb particle, if any, so that it can be put
        # back in front of every alternative.
        particle = ""
        parts = form.form.split()
        if len(parts) > 1 and parts[0] in verb_particles:
            particle = parts[0]
            form.form = " ".join(parts[1:])

        for separated in form.form.split(sep):
            separated = separated.strip()
            if not separated:
                # Nothing on this side of the separator (e.g. a lone "-"
                # placeholder cell); an empty form is never wanted.
                continue
            separated_form = form.model_copy(deep=True)
            if particle:
                separated_form.form = f"{particle} {separated}"
            else:
                separated_form.form = separated
            separated_forms.append(separated_form)

    # Strings inside parens: "ζητάν(ε)" > "ζητάν" and "ζητάνε".
    new_forms: list[Form] = []
    for form in separated_forms:
        text = form.form

        # Literal parentheses must be escaped; only the first parenthesised
        # group is expanded.
        m = re.match(r"^(.*?)\((.*?)\)(.*)$", text)
        if not m:
            new_forms.append(form)
            continue

        before, inside, after = m.groups()
        expanded = [before + after, before + inside + after]
        for variant in expanded:
            new_form = form.model_copy(deep=True)
            new_form.form = variant
            new_forms.append(new_form)

    # Stripping notes or expanding parens may leave nothing behind.
    return [form for form in new_forms if form.form]

591def process_cell_text(

592 wxr: WiktextractContext, cell: WikiNode

593) -> list[tuple[bool, bool, bool, str]]:

594 cell_text = wxr.wtp.node_to_text(cell, node_handler_fn=cell_node_fn)

595 cell_text = clean_value(wxr, cell_text)

596 split_text = BOLD_RE.split(cell_text)

597

598 # bold, italics, is greek, text

599 spans: list[tuple[bool, bool, bool, str]] = []

600

601 inside_bold = False

602 inside_italics = False

603 for i, text in enumerate(split_text):

604 text = text.strip()

605 if not text:

606 continue

607 if i % 2 == 0:

608 for ch in text:

609 if not ch.isalpha():

610 continue

611 greek = unicode_name(ch).startswith("GREEK")

612 break

613 else:

614 # no alphanumerics detected

615 continue

616

617 spans.append((inside_bold, inside_italics, greek, text))

618 continue

619 match text:

620 case "__B__":

621 inside_bold = True

622 case "__/B__":

623 inside_bold = False

624 case "__I__":

625 inside_italics = True

626 case "__/I__": 626 ↛ 603line 626 didn't jump to line 603 because the pattern on line 626 always matched

627 inside_italics = False

628

629 return spans

630

631

# Second element of the tuple returned by `is_header`.
UnformattedFound: TypeAlias = bool


def is_header(
    wxr: WiktextractContext,
    cell: WikiNode,
    spans: list[tuple[bool, bool, bool, str]],
    is_greek_entry: bool,
    unformatted_text_found: bool,
    first_cells_are_bold: bool,
) -> tuple[bool, UnformattedFound]:
    """Decide whether a table cell is a header cell.

    Container for more complex logic stuff because trying to figure out
    if something is a header can get messy.

    Args:
        wxr: Extraction context; used to emit a notice for ambiguous cells.
        cell: The table cell node.
        spans: Non-empty output of ``process_cell_text`` for ``cell``; only
            the first span is inspected.
        is_greek_entry: Whether the table belongs to a Greek-language entry.
        unformatted_text_found: Whether an earlier call reported unformatted
            text.
        first_cells_are_bold: Whether a cell on the first table row started
            with bold text.

    Returns:
        ``(is_header, unformatted_found)``. In the Greek-entry branch the
        second value is ``True`` only for a cell that is neither bold nor
        italic; in the non-Greek branch it is
        ``starts_bold or starts_italicized``.
    """
    if cell.kind == NodeKind.TABLE_HEADER_CELL:
        return True, False
    # HTML tables mark header cells with <th>; those nodes do not have the
    # TABLE_HEADER_CELL kind.
    if isinstance(cell, HTMLNode) and cell.tag == "th":
        return True, False

    starts_bold, starts_italicized, starts_greek, text = spans[0]

    # Formatting may also come from the cell's inline style attribute.
    style = cell.attrs.get("style", "")
    if "bold" in style:
        starts_bold = True
    if "italic" in style:
        starts_italicized = True

    # Not a Greek entry
    if not is_greek_entry:
        # If the table is for another language other than Greek, a cell
        # starting with Greek text is a table header
        return starts_greek, (starts_bold or starts_italicized)

    # Is a Greek entry
    if starts_italicized is True:
        return True, False

    if starts_bold is False:
        return False, True

    if unformatted_text_found:
        # This is bolded, but we've seen unformatted text before
        return True, False

    if first_cells_are_bold:
        return True, False

    wxr.wtp.wiki_notice(
        f"Can't be sure if bolded text entry '{text}' is a header or not",
        sortid="table/20250210a",
    )
    return False, False

Coverage for src / wiktextract / extractor / el / table.py: 83%

290 statements