Coverage for src/wiktextract/extractor/el/pos.py: 80%
465 statements
« prev ^ index » next — coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1import re
2from collections.abc import Iterator
3from functools import partial
4from typing import Any, TypeAlias
5from unicodedata import name as unicode_name
7from wikitextprocessor import (
8 HTMLNode,
9 NodeKind,
10 TemplateArgs,
11 TemplateNode,
12 WikiNode,
13)
14from wikitextprocessor.parser import LEVEL_KIND_FLAGS
16from wiktextract import WiktextractContext
17from wiktextract.extractor.el.tags import translate_raw_tags
18from wiktextract.page import clean_node
20from .head import parse_head
21from .linkages import process_linkage_section
22from .models import (
23 Example,
24 FormOf,
25 FormSource,
26 Linkage,
27 Sense,
28 TemplateData,
29 WordEntry,
30)
31from .parse_utils import (
32 GREEK_LANGCODES,
33 expand_suffix_forms,
34 parse_lower_heading,
35 remove_duplicate_forms,
36)
37from .section_titles import POS_HEADINGS, Heading, POSName
38from .table import parse_table, process_inflection_section, remove_article_forms
39from .tags_utils import convert_tags_in_sense
40from .text_utils import (
41 ENDING_NUMBER_RE,
42 normalized_int,
43)
44from .translations import process_translations
46# from wiktextract.wxr_logging import logger
def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    prev_data: WordEntry | None,  # data from the last entry in this language
    pos: POSName,  # the "noun" in "Noun 2"
    title: str,
    pos_tags: list[str],
    pos_num: int = -1,  # the "2" in "Noun 2"
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'.

    `data` provides basic data common with other POS sections, like
    pronunciation or etymology, and is mutated in place (pos, tags, sounds,
    forms, senses, sub-sections) before being returned.

    NOTE(review): the return type allows None, but every visible code path
    here returns `data` — confirm whether None is still a possible result.
    """

    # Metadata for different part-of-speech kinds.
    # print(f"{pos_title=}, {pos_tags=}, {pos_num=}")
    data.pos = pos  # the internal/translated name for the POS
    data.pos_num = pos_num  # SEW uses "Noun 1", "Noun 2" style headings.
    for pos_tag in pos_tags:
        if pos_tag not in data.tags:
            data.tags.append(pos_tag)

    wxr.wtp.start_subsection(title)

    # Sound data associated with this POS might be coming from a shared
    # section, in which case we've tried to tag the sound data with its
    # pos name + number if possible. Filter out stuff that doesn't fit.
    # This is actually pretty common, but if the edition has proper hierarchies
    # for this, doing this step might be unnecessary.
    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos section(s), so
            # we add it to everything; this is basically the default behavior.
            new_sounds.append(sound)
        else:
            for sound_pos in sound.poses:
                # A pos reference like "noun 2" is split into name + number.
                m = ENDING_NUMBER_RE.search(sound_pos)
                if m is not None:
                    s_num = normalized_int(m.group(1).strip())
                    s_pos = sound_pos[: m.start()].strip().lower()
                else:
                    s_pos = sound_pos.strip().lower()
                    s_num = -1
                # Normalize the heading text to the canonical pos name.
                sound_meta = POS_HEADINGS[s_pos]
                s_pos = sound_meta["pos"]
                if s_pos == data.pos and s_num == data.pos_num:
                    new_sounds.append(sound)
    data.sounds = new_sounds

    # Get child nodes *except* headings (= LEVEL).
    pos_contents = list(
        node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
        # include empty string only for debug printing?
    )

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # split_nodes_to_lines returns lists items on their own 'line'
    node_lines = list(split_nodes_to_lines(pos_contents))

    glosses_index = None
    glosses_lists = []
    for i, line in enumerate(node_lines):
        # Looking at the "rump" after glosses lists starts, it's simplest
        # just to pull all the list nodes, and handle them. Anything after
        # or inbetween (like categories, extra templates, tables and images)
        # can be ignored.
        if (
            len(line) == 1
            and isinstance(line[0], WikiNode)
            and line[0].kind == NodeKind.LIST
            and (line[0].sarg != ":")
        ):
            if glosses_index is None:
                glosses_index = i
            glosses_lists.append(line[0])

    if glosses_index is None:
        # if nothing found, accept ":" nodes
        for i, line in enumerate(node_lines):
            if (
                len(line) == 1
                and isinstance(line[0], WikiNode)
                and line[0].kind == NodeKind.LIST
            ):
                if glosses_index is None:
                    glosses_index = i
                glosses_lists.append(line[0])

    if glosses_index is None:
        # Could not find any glosses.
        # logger.info(f" //// {wxr.wtp.title}\n MISSING GLOSSES")
        wxr.wtp.wiki_notice("Missing glosses", sortid="pos/20250121")
        data.tags.append("no-gloss")

    template_data: list[TemplateData] = []
    # NOTE(review): category_data is collected by the handler below but never
    # read afterwards in this function — confirm whether it is still needed.
    category_data: list[str] = []
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in bold_node_handler
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not templates inside templates.
    template_depth = 0
    top_template_name: str | None = None

    def bold_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | str | None:
        """Insert special markers `__*S__` and `__*E__` around bold nodes so
        that the strings can later be split into "head-word" and "tag-words"
        parts. Collect incidental stuff, like side-tables, that are often
        put around the head."""
        assert isinstance(node, WikiNode)
        kind = node.kind
        nonlocal template_depth
        nonlocal top_template_name
        if kind == NodeKind.BOLD or (
            isinstance(node, HTMLNode)
            and (
                node.tag == "span"
                and "style" in node.attrs
                and (
                    "bold" in node.attrs["style"]
                    # Special handling for output for stuff in arabic script
                    or node.attrs["style"] == "color:black; font-size:200%;"
                )
                or node.tag == "b"
                or node.tag == "strong"
            )
        ):
            # These are word forms almost always
            return ["__B__", *node.children, "__/B__"]
        elif kind == NodeKind.ITALIC or (
            isinstance(node, HTMLNode)
            and (
                (
                    node.tag == "span"
                    and "style" in node.attrs
                    and "italic" in node.attrs["style"]
                )
                or node.tag == "i"
                or node.tag == "em"
            )
        ):
            # These are almost always tag words; often 'kai' isn't italicized,
            # for example.
            return ["__I__", *node.children, "__/I__"]
        elif isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # the templates are handled with bold_node_handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                template_data.append(
                    TemplateData(
                        name=node.template_name,
                        args={
                            str(k): clean_node(wxr, None, v)
                            for k, v in node.template_parameters.items()
                        },
                        expansion=expanded,
                    )
                )
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            # Track nesting so only visible top-level templates are recorded.
            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=bold_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret
        elif kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                return None
            if node.largs[0][0].startswith("Κατηγορία:"):
                category_data.append(node.largs[0][0][len("Κατηγορία:") :])
                return [""]
            # Special case for meta-links like Πρότυπο:ετ that generate
            # both a category link and :category link that is actually
            # displayed as a link, but for our purposes we want to ignore
            # that it is a link; it's a tag.
            if node.largs[0][0].startswith(":Κατηγορία:"):
                # unpacking a list-comprehension, unpacking into a list
                # seems to be more performant than adding lists together.
                return [
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                        node_handler_fn=bold_node_handler_fn,
                    )
                    # output the "visible" half of the link.
                ]
            if node.largs[0][0].startswith("Αρχείο:"):
                # Ignore image/file links entirely.
                return [""]
            # Often forms are 'formatted' with links, so let's mark these
            # too.
            return [
                "__L__",
                wxr.wtp.node_to_text(
                    node.largs[1:2] or node.largs[0],
                    node_handler_fn=bold_node_handler_fn,
                ),
                # output the "visible" half of the link.
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]
            # print(f"{node.largs=}")

        elif kind in {
            NodeKind.TABLE,
        }:
            # XXX Handle tables here
            # template depth and top-level template name
            nonlocal table_nodes
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # Get Head Line
    # Head *should* be immediately before the glosses...
    # print(node_lines[:glosses_index])
    found_head = False

    # Walk backwards from the glosses toward the section start, looking for
    # the first line that parses as a head.
    # NOTE(review): template_data is reset on every iteration, so after the
    # loop it holds templates from the *earliest* non-empty line, not
    # necessarily the head line — confirm this is intended.
    for line in reversed(node_lines[:glosses_index]):
        template_data = []
        template_depth = 0
        stripped = (
            wxr.wtp.node_to_text(line, node_handler_fn=bold_node_handler_fn)
            .removeprefix(":")
            .strip()
        )
        if not stripped:
            continue
        if not found_head and (parsed_forms := parse_head(wxr, stripped)):
            for form in parsed_forms:
                translate_raw_tags(form)

            if (
                data.lang_code == "el"
                and not data.word.startswith("-")
                # If there are spaces around the "/", we don't parse the
                # header correctly, so just skip the expansion.
                # Ex. "πρωτοπόρος, -α / -ος, -ο"
                # Remove this check if that ever gets fixed.
                and len(parsed_forms) == 3
                # Only adjectives or participles
                and (
                    data.pos == "adj"
                    or (data.pos == "verb" and "participle" in data.tags)
                )
            ):
                parsed_forms = expand_suffix_forms(parsed_forms)
            parsed_forms = remove_article_forms(parsed_forms, data.word)
            data.forms.extend(parsed_forms)
            found_head = True

    if not found_head:
        # There are a bunch of Greek Wiktionary articles with POS sections
        # without heads, but they seem to always follow ones with heads;
        # in this case, the result is just not including any `forms` field
        # for these (or copying the previous one).

        if prev_data is None:
            wxr.wtp.wiki_notice(
                f"Part of speech missing head: {wxr.wtp.title}",
                sortid="pos/460/20250104",
            )
        else:
            # No head found, copy previous (in this language)
            data.forms = [
                form.model_copy(deep=True) for form in prev_data.forms
            ]

    if len(template_data) > 0:
        data.head_templates = template_data
        # logger.info(
        #     f"  //// {wxr.wtp.title}\n  >>>"
        #     + "\n  >>>".join(repr(td) for td in template_data)
        # )

    # Parse any side-tables (usually inflection tables) the handler collected.
    for template_name, table_node in table_nodes:
        # XXX template_name
        parse_table(
            wxr,
            table_node,
            data,
            data.lang_code in GREEK_LANGCODES,
            template_name=template_name or "",
            source="inflection",
        )

    data.forms = remove_duplicate_forms(wxr, data.forms)

    # Ignore images and files
    # 2025-01-17 13:10:10,035 INFO: //// Ηρόδοτος
    # //// [[Αρχείο:Herodotus Massimo Inv124478.jpg|200px|thumb|[[προτομή]] του Ηροδότου]]

    # Have to ignore {{(( specifically. Creates columns.
    # 2025-01-17 13:10:11,059 INFO: //// κάνω
    # //// {{((|width=97%}}

    # logger.info(f"<<<<<<<<< {wxr.wtp.title}\n<<<" + "\n<<<".join(pparts))
    # see: free -> {{en-verb-'free'}} creates a floating inflection table
    # followed by the usual head template

    # see: τηλεομοιοτυπία
    # '''{{PAGENAME}}''' {{θ}}
    # theta is basically {{f|...}}

    # see: θηλυκός
    # '''{{PAGENAME}}, -ή''' ''και'' '''-ιά, -ό'''
    # pagename, -e and -ia, -o, no indication of what these mean

    # Ιόνια νησιά
    # >>>'''{{PAGENAME}}''' ''πληθυντικός του'' [[ιόνιος|Ιόνιο]] [[νησί]]
    # plural of 'Ionian island'

    # >>>>>>>>> free
    # >>>{{en-adj-r}} # floating table
    # >>>{{τ|en|{{PAGENAME}}}}, ''συγκριτικός'' '''freer''', ''υπερθετικός'' '''freest'''
    # pretty consistent bolding and italics

    # genus
    # {{τ|la|{{PAGENAME}}}} {{ο}} ([[γενική]]: generis) (3ης [[κλίση]]ς)

    # ουδέτερος
    # >>>'''{{PAGENAME}} -η -ο'''

    # καφέ
    # >>>'''{{PAGENAME}}''' {{ακλ|επίθ}}
    # aklitos, uninflected

    # καφέ
    # >>>[[Αρχείο:Cafe Museum - Vienna (5402363918).jpg|μικρογραφία|τραπέζι σε βιενέζικο '''καφέ''']]
    # >>>'''{{PAGENAME}}''' {{ο}} {{ακλ|ουδ}}
    # Ignore images

    # κρόκος
    # >>>{| align="right"
    # >>>
    # >>>|-
    # >>>
    # >>>|[[Αρχείο:Crocus sativus1.jpg|thumb|150px|Άνθη '''κρόκου''' (''Crocus sativus'').]]
    # >>>
    # >>>
    # >>>|[[Αρχείο:Raw egg.jpg|thumb|200px|Ο '''κρόκος''' ενός αβγού.]]
    # >>>
    # >>>
    # >>>|}
    # >>>
    # >>>'''{{PAGENAME}}''' {{α}}

    # p
    # >>>'''{{PAGENAME}}''' ''πεζό'' (''κεφαλαίο:'' '''{{l|P|la}}''')
    # lowercase, uppercase

    # Δημόκριτος
    # >>>'''{{PAGENAME}}'''
    # >>># {{όνομα||α}}
    # >>>{{clear}}
    # Clear is just formatting to move the line down where there are empty
    # margins.

    # BIG ASSUMPTION: let us assume that Greek Wiktionary doesn't use templates
    # that generate multiline text that is part of head. That is, we can see
    # each newline because they are in strings, and when something that does
    # generate virtual newlines (list) pops up, that's when the head portion
    # ends.
    # Greek Wiktionary head sections look like this:
    # > Pre-head templates that create side-tables, like inflections
    # > Possible formatting templates like {{clear}} that should be ignored
    # > Head template last before glosses list
    # > Clear again...
    # > Glosses list tree, where we can stop.
    # We can create "lines" of these by looping over the items in pos_content
    # and looking for newlines in strings, because that's where they mainly
    # should be (except side-table templates). We handle earlier lines
    # differently than the last line before the glosses list, which is the
    # head.

    # return None

    # ======================

    ### Glosses after head ###
    # parts = []
    got_senses = False
    for lst in glosses_lists:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses, but with
        # Simple English Wiktionary that seems rare.
        # logger.debug(f"{lst}")
        senses = recurse_glosses(wxr, lst, data)
        if len(senses) > 0:
            got_senses = True
            for sense in senses:
                translate_raw_tags(sense)
            data.senses.extend(senses)

    if not got_senses and len(glosses_lists) > 0:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    # If there is no list, clump everything into one gloss.
    # if not len(glosses_lists > 0):
    #     sense = Sense()
    #     found_gloss = parse_gloss(wxr, sense, pos_contents[i:])
    #     if found_gloss is True or len(sense.raw_tags) > 0:
    #         convert_tags_in_sense(sense)
    #         if len(sense.glosses) == 0 and "no-gloss" not in sense.tags:
    #             sense.tags.append("no-gloss")
    #         data.senses.append(sense)

    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    #####
    #####
    # TEMP DEBUG PRINTS

    # Handle sub-sections (translations, inflection, linkages) below this POS.
    pos_sublevels = list(
        node.find_child(LEVEL_KIND_FLAGS)
        # include empty string only for debug printing?
    )

    for sl in pos_sublevels:
        subtitle = clean_node(wxr, None, sl.largs).lower().strip()

        heading_type, *_ = parse_lower_heading(wxr, subtitle)

        if heading_type == Heading.Translations:
            process_translations(wxr, data, sl)
        elif heading_type == Heading.Infl:
            source: FormSource = "inflection"
            if data.lang_code in ("el", "grc"):
                # Greek inflection sections for verbs are conjugations.
                source = "conjugation"
            process_inflection_section(wxr, data, sl, source=source)
        elif heading_type in (
            Heading.Related,
            Heading.Synonyms,
            Heading.Antonyms,
            Heading.Transliterations,
        ):
            process_linkage_section(wxr, data, sl, heading_type)
        # if heading_type not in (
        #     Heading.Translations,
        #     Heading.Ignored,
        #     Heading.Infl,
        #     Heading.Related,
        #     Heading.Synonyms,
        #     Heading.Antonyms,
        #     Heading.Derived,
        #     # We're going to ignore homonyms because they're
        #     # only tangentially related, like anagrams
        #     Heading.Homonyms,
        # ):
        #     # ...
        #     expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(sl))
        #     # text = clean_node(wxr, None, sl)
        #     logger.warning(
        #         f"""
        # {wxr.wtp.title}: {heading_type}, {ok=}
        # {expanded}

        # ###########################
        # """
        #     )

    #####
    #####
    return data
# Matches one or more parenthesized groups at the very start of a gloss line,
# e.g. "(ιατρική) (σπάνιο) rest of gloss"; used to peel off template-less tags.
PARENS_BEFORE_RE = re.compile(r"\s*(\([^()]+\)\s*)+")
# Captures the contents of each individual parenthesized group within the
# prefix matched by PARENS_BEFORE_RE.
ITER_PARENS_RE = re.compile(r"\(([^()]+)\)")
def bold_node_fn(
    node: WikiNode,
) -> list[str | WikiNode] | None:
    """Wrap italic and bold nodes in marker tokens.

    Returns the node's children bracketed by `__I__`/`__/I__` or
    `__B__`/`__/B__` so later text processing can distinguish emphasized
    spans; returns None for every other node kind (default handling).
    """
    # print(f"{node=}")
    if node.kind == NodeKind.ITALIC:
        markers = ("__I__", "__/I__")
    elif node.kind == NodeKind.BOLD:
        markers = ("__B__", "__/B__")
    else:
        return None
    opener, closer = markers
    return [opener, *node.children, closer]
def extract_form_of_templates(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
    siblings: list[str | WikiNode],
    siblings_index: int,
) -> None:
    """Parse form_of for nouns, adjectives and verbs.

    Supports:
    1. κλ          | generic                | form_of
    2. γρ          | generic                | form_of
    3. πτώση/πτώσεις | nouns, adjectives etc. | form_of and tags
    4. υπο/υποκ    | nouns                  | form_of
    5. μεγ/μεγεθ   | nouns                  | form_of
    6. ρημ τύπος   | verbs                  | form_of
    7. μτχ         | verbs                  | form_of

    * References:
    1. https://el.wiktionary.org/wiki/Πρότυπο:κλ
    2. https://el.wiktionary.org/wiki/Module:άλλημορφή
    3. https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_κλιτικούς_τύπους
    4. https://el.wiktionary.org/wiki/Πρότυπο:υπο
    5. https://el.wiktionary.org/wiki/Πρότυπο:μεγ
    6. https://el.wiktionary.org/wiki/Πρότυπο:ρημ_τύπος
    7. https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_μετοχές
    """
    name = t_node.template_name
    params = t_node.template_parameters

    # Pre-bind the common arguments of the basic extractor.
    basic_extract = partial(
        extract_form_of_templates_basic,
        wxr,
        parent_sense,
        siblings,
        siblings_index,
        name,
        t_node,
    )

    # Generic inflection template.
    if name == "κλ":
        return basic_extract(extract_argument=2)

    # Notes:
    # * All occurrences in wiktionary have at least one argument
    # * Only handle cases where the second argument refers to a form:
    #   μορφ / μορφή / λόγια μορφή του, etc.
    #   and ignore those mistakenly used as synonym templates
    if name in ("γρ", "γραφή του", "alter") and 2 in params:
        if "μορφ" in clean_node(wxr, None, params[2]):
            return basic_extract(extract_argument=1)

    # Nouns and adjectives: case-inflection templates.
    if ("πτώσεις" in name or "πτώση" in name) and 1 in params:
        return extract_form_of_templates_ptosi(wxr, parent_sense, t_node)

    # Nouns: diminutives and augmentatives.
    # The "diminutive/augmentative" tags are added later on via translation
    # of the "υποκοριστικό/μεγεθυντικό" raw_tags.
    if name in ("υπο", "υποκ", "μεγ", "μεγεθ") and 1 in params:
        return basic_extract(extract_argument=1)

    # Verbs.
    if name == "ρημ τύπος":
        return basic_extract(extract_argument=2)

    # Participles.
    if name.startswith("μτχ"):
        return basic_extract(extract_argument=1)
def extract_form_of_templates_basic(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    siblings: list[str | WikiNode],
    sibling_index: int,
    t_name: str,
    t_node: TemplateNode,
    extract_argument: int | str,
) -> None:
    """Extract a form_of lemma from a single template argument.

    Reads `extract_argument` from `t_node`'s parameters; when the argument
    is missing (e.g. the μτχππ template has no args), falls back to
    consuming the link nodes that immediately follow the template among
    `siblings`. Appends a FormOf entry to `parent_sense` when a non-empty
    lemma is found; otherwise emits a wiki notice.
    """
    t_args = t_node.template_parameters
    if extract_argument in t_args:
        lemma = clean_node(wxr, None, t_args[extract_argument]).strip()
    else:
        # mtxpp template has no args, consume the next links for the
        # form_of field
        # cf. https://github.com/tatuylonen/wiktextract/issues/1372
        wxr.wtp.wiki_notice(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/570/20250517",
        )
        links: list[str | WikiNode] = []
        for node in siblings[sibling_index + 1 :]:
            # Accept only whitespace strings and link nodes; stop at
            # anything else.
            if not (
                (isinstance(node, str) and node.strip() == "")
                or (isinstance(node, WikiNode) and node.kind == NodeKind.LINK)
            ):
                break
            links.append(node)
        lemma = clean_node(wxr, None, links).strip()

    if lemma:
        form_of = FormOf(word=lemma)
        parent_sense.form_of.append(form_of)
    else:
        # Fixed diagnostic wording: previously the two concatenated string
        # parts produced "...whitespace:{t_name}" with no separating space.
        wxr.wtp.wiki_notice(
            "Lemma extracted from form-of template was empty or whitespace: "
            f"{t_name}, {t_args=}, {lemma=}",
            sortid="pos/609/20250925",
        )
# Three-letter Greek gender abbreviations (from template-name prefixes like
# "ουδ του-...") mapped to English gender tags.
PTOSI_GENDER_INFLECTION_MAP = {
    "θηλ": "feminine",
    "αρσ": "masculine",
    "ουδ": "neuter",
}
# Lowercase number suffix of πτώση/πτώσεις template names -> number tag.
PTOSI_NUMBER_INFLECTION_MAP = {
    "εν": "singular",
    "πλ": "plural",
}
# Uppercase case letters of πτώση/πτώσεις template names -> case tags.
PTOSI_CASE_INFLECTION_MAP = {
    "Ο": "nominative",
    "Α": "accusative",
    "Γ": "genitive",
    "Κ": "vocative",
}
def extract_form_of_templates_ptosi(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
) -> None:
    """Parse form_of for nouns and adjectives.

    Supports:
    * [gender του] πτώση-πτώσεις templates

    Notes:
    * The πτώση-πτώσεις templates contains:
        * Case(s): 1 for πτώση, >1 for πτώσεις - in uppercase characters.
        * Number: "εν" (singular) or "πλ" (plural)
    Examples:
    * {{πτώσηΑεν|κόρφος}} > accusative | singular
    * {{πτώσειςΟΚπλ|κόρφος}} > nominative, vocative | plural
    """
    name = t_node.template_name
    tags: list[str] = []

    # Parse and consume an optional gender prefix,
    # cf. {{ουδ του-πτώσειςΟΑΚεν|καλός|grc}}
    if "-" in name:
        gender_part, inflection = name.split("-")
        try:
            tags.append(PTOSI_GENDER_INFLECTION_MAP[gender_part[:3]])
        except KeyError:
            # Unrecognized gender abbreviation: bad template name.
            return
    else:
        inflection = name

    # Strip the πτώση/πτώσεις prefix, leaving the case letters and the
    # number suffix.
    for prefix in ("πτώσεις", "πτώση"):
        if inflection.startswith(prefix):
            inflection = inflection[len(prefix) :]
            break

    lower_part = "".join(ch for ch in inflection if ch.islower())
    upper_part = [ch for ch in inflection if not ch.islower()]
    try:
        number_tag = PTOSI_NUMBER_INFLECTION_MAP[lower_part]
        case_tags = [PTOSI_CASE_INFLECTION_MAP[ch] for ch in upper_part]
    except KeyError:
        # Unrecognized case/number letters: bad template name.
        return

    tags.extend([*case_tags, number_tag])
    tags.sort()  # For the tests, but also good practice

    lemma = clean_node(wxr, None, t_node.template_parameters[1])
    parent_sense.form_of.append(FormOf(word=lemma))
    parent_sense.tags.extend(tags)
def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Take what is preferably a line of text and extract tags and a gloss from
    it. The data is inserted into parent_sense, and for recursion purposes
    we return a boolean that tells whether there was any gloss text in a
    lower node."""
    if len(contents) == 0:
        return False

    # First pass: harvest form_of data from any template nodes on the line.
    for i, t_node in enumerate(contents):
        if isinstance(t_node, TemplateNode):
            extract_form_of_templates(wxr, parent_sense, t_node, contents, i)

    # NOTE(review): template_tags is never populated anywhere in this
    # function, so the extend() near the end is currently dead code.
    template_tags: list[str] = []

    bl_linkages: list[Linkage] = []
    no_gloss_but_keep_anyway = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        # Capture "βλ" (see-also) templates as related-word linkages and
        # remove them from the rendered text.
        nonlocal bl_linkages
        if name == "βλ":
            for k, v in ht.items():
                if isinstance(k, int):
                    bl_linkages.append(Linkage(word=clean_node(wxr, None, v)))
            return ""
        return None

    # The rest of the text.
    text = clean_node(
        wxr,
        parent_sense,
        contents,
        template_fn=bl_template_handler_fn,
        node_handler_fn=bold_node_fn,
    )

    if len(bl_linkages) > 0:
        parent_sense.related.extend(bl_linkages)
        # A line consisting only of βλ links still yields a (glossless) sense.
        no_gloss_but_keep_anyway = True

    if not text.strip():
        if len(bl_linkages) <= 0:
            return False

    # print(f"  ============ {contents=}, {text=}")

    # Greek Wiktionary uses a lot of template-less tags.
    if parens_n := PARENS_BEFORE_RE.match(text):
        blocks = ITER_PARENS_RE.findall(parens_n.group(0))
        # print(f"{blocks=}")
        kept_blocks: list[str] = []
        forms: list[str] = []
        raw_tag_texts: list[str] = []
        for block in blocks:
            if block_has_non_greek_text(block):
                # Keep parentheses with non-greek text with gloss text)
                kept_blocks.extend(("(", block, ") "))
                continue
            nforms, nraw_tag_texts = extract_forms_and_tags(block)
            forms.extend(nforms)
            raw_tag_texts.extend(nraw_tag_texts)
        # print(f"{forms=}, {raw_tag_texts=}")
        if forms:
            # print(f"{forms=}")
            parent_sense.related.extend(Linkage(word=form) for form in forms)
        parent_sense.raw_tags.extend(raw_tag_texts)
        kept_blocks.append(text[parens_n.end() :])
        text = "".join(kept_blocks)

    # Strip the italic/bold markers inserted by bold_node_fn.
    text = re.sub(r"__/?[IB]__", "", text)

    if len(template_tags) > 0:
        parent_sense.raw_tags.extend(template_tags)

    if len(text) > 0:
        parent_sense.glosses.append(text)
        return True

    if no_gloss_but_keep_anyway:
        parent_sense.tags.append("no-gloss")
        return True

    return False
# Semantic aliases for the linkage lists returned by recurse_glosses1; all
# three roles share the same Linkage model but are kept in separate lists.
Related: TypeAlias = Linkage
Synonym: TypeAlias = Linkage
Antonym: TypeAlias = Linkage
863def recurse_glosses1(
864 wxr: WiktextractContext,
865 parent_sense: Sense,
866 node: WikiNode,
867) -> tuple[
868 list[Sense],
869 list[Example],
870 list[Related],
871 list[Synonym],
872 list[Antonym],
873]:
874 """Helper function for recurse_glosses"""
875 # print(f"{node=}")
877 ret_senses: list[Sense] = []
878 ret_examples: list[Example] = []
879 ret_related: list[Related] = []
880 ret_synonyms: list[Synonym] = []
881 ret_antonyms: list[Antonym] = []
882 found_gloss = False
884 # Pydantic stuff doesn't play nice with Tatu's manual dict manipulation
885 # functions, so we'll use a dummy dict here that we then check for
886 # content and apply to `parent_sense`.
887 dummy_parent: dict[str, Any] = {}
889 related_linkages: list[Linkage] = []
890 example_is_synonym = False
891 example_is_antonym = False
893 def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
894 nonlocal related_linkages
895 nonlocal example_is_synonym
896 nonlocal example_is_antonym
897 # Sometimes the bl-templates point to synonyms or antonyms, instead
898 # of just "related"; we save them, and if example_is_xxxnym is true,
899 # we later return them as xxxnyms.
900 if name == "βλ":
901 for k, v in ht.items():
902 if isinstance(k, int):
903 related_linkages.append(
904 Linkage(word=clean_node(wxr, None, v))
905 )
906 return ""
907 if name in ("συνων", "συνών"):
908 example_is_synonym = True
909 return ""
910 if name in ("αντων", "αντών"):
911 example_is_antonym = True
912 return ""
913 return None
915 # List nodes contain only LIST_ITEM nodes, which may contain sub-LIST nodes.
916 if node.kind == NodeKind.LIST:
917 list_ret: tuple[
918 list[Sense],
919 list[Example],
920 list[Related],
921 list[Synonym],
922 list[Antonym],
923 ] = ([], [], [], [], [])
924 for child in node.children:
925 if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM: 925 ↛ 927line 925 didn't jump to line 927 because the condition on line 925 was never true
926 # This should never happen
927 wxr.wtp.error(
928 f"{child=} is direct child of NodeKind.LIST",
929 sortid="simple/pos/44",
930 )
931 continue
932 (
933 senses,
934 examples,
935 related,
936 synonyms,
937 antonyms,
938 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child)
939 list_ret[0].extend(senses)
940 list_ret[1].extend(examples)
941 list_ret[2].extend(related)
942 list_ret[3].extend(synonyms)
943 list_ret[4].extend(antonyms)
944 return list_ret
946 elif node.kind == NodeKind.LIST_ITEM: 946 ↛ 1048line 946 didn't jump to line 1048 because the condition on line 946 was always true
947 # Split at first LIST node found
948 split_at = next(
949 (
950 i
951 for i, c in enumerate(node.children)
952 if isinstance(c, WikiNode) and c.kind == NodeKind.LIST
953 ),
954 len(node.children),
955 )
956 contents = node.children[:split_at]
957 sublists = node.children[split_at:]
959 # A LIST and LIST_ITEM `sarg` is basically the prefix of the line, like
960 # `#` or `##:`: the token that appears at the very start of a line that
961 # is used to parse the depth and structure of lists.
962 # `#` Item 1
963 # `##` Item 1.1
964 # `##*` Example 1.1
965 if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"):
966 # This is either a quotation or example.
967 text = clean_node(
968 wxr, dummy_parent, contents, template_fn=bl_template_handler_fn
969 ).strip("⮡ \n")
971 # print(f"{contents=}, {text=}, {related_linkages=}")
973 if example_is_synonym or example_is_antonym:
974 link_linkages = []
975 for snode in contents:
976 if not isinstance(snode, WikiNode):
977 continue
978 if snode.kind == NodeKind.LINK:
979 link_linkages.append(
980 Linkage(
981 word=clean_node(wxr, None, snode.largs[0][0])
982 )
983 )
984 else:
985 for link in snode.find_child_recursively(NodeKind.LINK): 985 ↛ 986line 985 didn't jump to line 986 because the loop on line 985 never started
986 link_linkages.append(
987 Linkage(word=clean_node(wxr, None, link))
988 )
990 # print("=====")
991 # print(f"{link_linkages=}")
993 if example_is_synonym:
994 return [], [], [], link_linkages + related_linkages, []
995 elif example_is_antonym: 995 ↛ 998line 995 didn't jump to line 998 because the condition on line 995 was always true
996 return [], [], [], [], link_linkages + related_linkages
998 if len(related_linkages) > 0:
999 # parent_sense.related.extend(bl_linkages)
1000 # related_linkages = []
1001 # if not text.strip():
1002 return [], [], related_linkages, [], []
1004 example_is_synonym = False
1005 example_is_antonym = False
1007 if not text.strip(): 1007 ↛ 1008line 1007 didn't jump to line 1008 because the condition on line 1007 was never true
1008 return [], [], [], [], []
1010 example = Example(text=text)
1011 # logger.debug(f"{wxr.wtp.title}/example\n{text}")
1012 if len(sublists) > 0:
1013 translation = clean_node(wxr, dummy_parent, sublists).strip(
1014 "#*: \n"
1015 )
1016 if translation != "": 1016 ↛ 1019line 1016 didn't jump to line 1019 because the condition on line 1016 was always true
1017 example.translation = translation
1019 for k, v in dummy_parent.items(): 1019 ↛ 1020line 1019 didn't jump to line 1020 because the loop on line 1019 never started
1020 if k == "categories":
1021 parent_sense.categories.extend(v)
1022 dummy_parent = {}
1024 return [], [example], [], [], []
1026 found_gloss = parse_gloss(wxr, parent_sense, contents)
1028 for sl in sublists:
1029 if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST): 1029 ↛ 1031line 1029 didn't jump to line 1031 because the condition on line 1029 was never true
1030 # Should not happen
1031 wxr.wtp.error(
1032 f"Sublist is not NodeKind.LIST: {sublists=!r}",
1033 sortid="simple/pos/82",
1034 )
1035 continue
1036 (
1037 senses,
1038 examples,
1039 related,
1040 synonyms,
1041 antonyms,
1042 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), sl)
1043 ret_senses.extend(senses)
1044 ret_examples.extend(examples)
1045 ret_related.extend(related)
1046 ret_synonyms.extend(synonyms)
1047 ret_antonyms.extend(antonyms)
1048 if len(ret_senses) > 0:
1049 # the recursion returned actual senses from below, so we will
1050 # ignore everything else (incl. any example data that might have
1051 # been given to parent_sense) and return that instead.
1052 # XXX if this becomes relevant, add the example data to a returned
1053 # subsense instead?
1054 # if any(
1055 # isinstance(r, Sense) and r.tags == ["no-gloss"] for r in ret
1056 # ):
1057 # print(f"{ret=}")
1058 return (
1059 combine_senses_with_identical_glosses(ret_senses),
1060 [],
1061 [],
1062 [],
1063 [],
1064 )
1066 # If nothing came from below, then this.
1067 if found_gloss is True or "no-gloss" in parent_sense.tags: 1067 ↛ 1075line 1067 didn't jump to line 1075 because the condition on line 1067 was always true
1068 parent_sense.examples.extend(ret_examples)
1069 parent_sense.related.extend(ret_related)
1070 parent_sense.synonyms.extend(ret_synonyms)
1071 parent_sense.antonyms.extend(ret_antonyms)
1073 return [parent_sense], [], [], [], []
1075 return [], ret_examples, ret_related, ret_synonyms, ret_antonyms
def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Walk a list subtree and collect fully-built Sense objects.

    Delegates the actual recursion to recurse_glosses1(); any example or
    linkage data still loose at this level (instead of attached to a
    sense further down) indicates a parsing problem and is reported.
    """
    senses, examples, related, synonyms, antonyms = recurse_glosses1(
        wxr, Sense(), node
    )
    if examples or related or synonyms or antonyms:
        wxr.wtp.error(
            "NOT Sense has bubbled to recurse_glosses: "
            f"{examples=}, {related=}, {synonyms=}, {antonyms=}",
            sortid="pos/glosses/966",
        )

    # Normalize raw tags on every sense before handing them back.
    collected: list[Sense] = []
    for sense in senses:
        convert_tags_in_sense(sense)
        collected.append(sense)
    return collected
def split_nodes_to_lines(
    nodes: list[WikiNode | str],
) -> Iterator[list[WikiNode | str]]:
    """Take a list of nodes and split up the list into lines.
    This could be done by using node_to_wikitext() to reverse the parsing,
    and then you could parse the individual lines after splitting the text,
    but it seems unnecessary in the context of Greek Wiktionary PoS sections.

    Yields one list of nodes/strings per logical source line.  LIST nodes
    are always yielded alone as a line of their own; a few layout-only
    templates are dropped entirely.
    """
    parts: list[WikiNode | str] = []
    for node in nodes:
        if isinstance(node, WikiNode):
            # Lists are returned as whole, they're their own line
            if node.kind == NodeKind.LIST:
                # Flush whatever was accumulated before the list first.
                if len(parts) > 0:
                    yield parts
                    parts = []
                yield [node]
                continue
            if isinstance(node, TemplateNode) and node.template_name in (
                # Ignore specific templates, like {{((}} that bookends a column.
                "((",
                "))",
                "clear",
                "κλείδα-ελλ",
            ):
                continue
            parts.append(node)
        else:
            if "\n" in node:
                # Every interior newline terminates the current line.
                # NOTE: a blank interior line still yields the (possibly
                # empty) accumulated `parts`.
                split_string = node.splitlines()
                for spl in split_string[:-1]:
                    if spl:
                        parts.append(spl)
                    yield parts
                    parts = []
                # special handling for final newline; splitlines ignores it
                if node.endswith("\n"):
                    if split_string[-1]:
                        parts.append(split_string[-1])
                    yield parts
                    parts = []
                elif split_string[-1]:
                    # Trailing text with no newline: keep accumulating.
                    parts.append(split_string[-1])
            elif node:
                parts.append(node)

    # yield final parts
    if len(parts) > 0:
        yield parts
1157BOLD_RE = re.compile(r"(__/?[BI]__|, |\. )")
1160def extract_forms_and_tags(tagged_text: str) -> tuple[list[str], list[str]]:
1161 forms: list[str] = []
1162 tags: list[str] = []
1164 # print(f"{tagged_text=}")
1165 # inside_italics = False
1166 inside_bold = False
1168 for i, t in enumerate(BOLD_RE.split(tagged_text)):
1169 t = t.strip()
1170 # print(f"{i}: {t=}")
1171 if not t:
1172 continue
1174 if i % 2 == 0:
1175 # Text between splitters
1176 if inside_bold is True: 1176 ↛ 1177line 1176 didn't jump to line 1177 because the condition on line 1176 was never true
1177 forms.append(t)
1178 continue
1179 # Add everything else to raw_tags
1180 # if inside_italics is True:
1181 # tags.append(t)
1182 # continue
1183 # ". " and ", " just split. They're stripped to "." and "," if
1184 # this needs to be modified later.
1185 tags.append(t)
1186 continue
1187 match t:
1188 case "__B__": 1188 ↛ 1189line 1188 didn't jump to line 1189 because the pattern on line 1188 never matched
1189 inside_bold = True
1190 case "__/B__": 1190 ↛ 1191line 1190 didn't jump to line 1191 because the pattern on line 1190 never matched
1191 inside_bold = False
1192 # case "__I__":
1193 # inside_italics = True
1194 # case "__/I__":
1195 # inside_italics = False
1197 return forms, tags
META_RE = re.compile(r"__/?[ILEB]__")


def block_has_non_greek_text(text: str) -> bool:
    """Return True if any word in `text` appears to be non-Greek.

    After stripping `__B__`-style meta markers, only the first
    alphabetic character of each whitespace-separated token is
    inspected: if its Unicode name does not start with "GREEK", the
    whole block is treated as containing non-Greek text.
    """
    stripped = META_RE.sub("", text)
    for token in stripped.split():
        first_alpha = next((ch for ch in token if ch.isalpha()), None)
        if first_alpha is None:
            # Token has no letters at all (digits, punctuation); skip it.
            continue
        if not unicode_name(first_alpha).startswith("GREEK"):
            return True
    return False
def combine_senses_with_identical_glosses(
    orig_senses: list[Sense],
) -> list[Sense]:
    """Merge senses whose gloss tuples are identical.

    Senses are bucketed by their gloss lists (in original order); when a
    bucket holds more than one sense, the later ones are merged into the
    first.  If every gloss list is unique, the original list object is
    returned unchanged.
    """
    buckets: dict[tuple[str, ...], list[Sense]] = {}
    duplicates_seen = False

    for sense in orig_senses:
        bucket = buckets.setdefault(tuple(sense.glosses), [])
        if bucket:
            duplicates_seen = True
        bucket.append(sense)

    if not duplicates_seen:
        return orig_senses

    merged: list[Sense] = []
    for group in buckets.values():
        first, *rest = group
        for other in rest:
            first.merge(other)
        merged.append(first)

    return merged