Coverage for src/wiktextract/extractor/el/pos.py: 80%

452 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2from collections.abc import Iterator 

3from functools import partial 

4from typing import TypeAlias 

5from unicodedata import name as unicode_name 

6 

7from wikitextprocessor import ( 

8 HTMLNode, 

9 NodeKind, 

10 TemplateArgs, 

11 TemplateNode, 

12 WikiNode, 

13) 

14from wikitextprocessor.parser import LEVEL_KIND_FLAGS 

15 

16from wiktextract import WiktextractContext 

17from wiktextract.page import clean_node 

18from wiktextract.wxr_logging import logger 

19 

20from .head import parse_head 

21from .linkages import process_linkage_section 

22from .models import Example, FormOf, Linkage, Sense, TemplateData, WordEntry 

23from .parse_utils import ( 

24 GREEK_LANGCODES, 

25 Heading, 

26 parse_lower_heading, 

27 remove_duplicate_forms, 

28) 

29from .section_titles import POS_HEADINGS 

30from .table import parse_table, process_inflection_section 

31from .tags_utils import convert_tags_in_sense 

32from .text_utils import ( 

33 ENDING_NUMBER_RE, 

34 normalized_int, 

35) 

36from .translations import process_translations 

37 

38# from wiktextract.wxr_logging import logger 

39 

40 

def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    prev_data: WordEntry | None,  # data from the last entry in this language
    # the "noun" in "Noun 2"
    pos: str,
    title: str,
    pos_tags: list[str],
    # the "2" in "Noun 2"
    pos_num: int = -1,
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'. `data` provides basic
    data common with other POS sections, like pronunciation or etymology.

    `data` is mutated in place (pos, sounds, forms, senses, sub-section
    data) and returned. On the current code paths this always returns
    `data`, never None, but the signature permits None for callers.

    `prev_data` is used only when this section has no head line of its
    own: its forms are then deep-copied into `data`.
    """

    # Metadata for different part-of-speech kinds.
    # print(f"{pos_title=}, {pos_tags=}, {pos_num=}")
    data.pos = pos  # the internal/translated name for the POS
    data.pos_num = pos_num  # SEW uses "Noun 1", "Noun 2" style headings.

    wxr.wtp.start_subsection(title)

    # Sound data associated with this POS might be coming from a shared
    # section, in which case we've tried to tag the sound data with its
    # pos name + number if possible. Filter out stuff that doesn't fit.
    # This is actually pretty common, but if the edition has proper hierarchies
    # for this, doing this step might be unnecessary.
    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos section(s), so
            # we add it to everything; this is basically the default behavior.
            new_sounds.append(sound)
        else:
            for sound_pos in sound.poses:
                # Split a tag like "noun 2" into its name and number parts.
                m = ENDING_NUMBER_RE.search(sound_pos)
                if m is not None:
                    s_num = normalized_int(m.group(1).strip())
                    s_pos = sound_pos[: m.start()].strip().lower()
                else:
                    s_pos = sound_pos.strip().lower()
                    s_num = -1
                # Normalize the heading text to the canonical POS name.
                sound_meta = POS_HEADINGS[s_pos]
                s_pos = sound_meta["pos"]
                if s_pos == data.pos and s_num == data.pos_num:
                    new_sounds.append(sound)
    data.sounds = new_sounds

    # Get child nodes *except* headings (= LEVEL).
    pos_contents = list(
        node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
        # include empty string only for debug printing?
    )

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # split_nodes_to_lines returns lists items on their own 'line'
    node_lines = list(split_nodes_to_lines(pos_contents))

    # Index of the first line that starts the glosses (numbered lists);
    # everything before it is treated as the head area.
    glosses_index = None
    glosses_lists = []
    for i, line in enumerate(node_lines):
        # Looking at the "rump" after glosses lists starts, it's simplest
        # just to pull all the list nodes, and handle them. Anything after
        # or inbetween (like categories, extra templates, tables and images)
        # can be ignored.
        if (
            len(line) == 1
            and isinstance(line[0], WikiNode)
            and line[0].kind == NodeKind.LIST
            and (line[0].sarg != ":")
        ):
            if glosses_index is None:
                glosses_index = i
            glosses_lists.append(line[0])

    if glosses_index is None:
        # if nothing found, accept ":" nodes
        for i, line in enumerate(node_lines):
            if (
                len(line) == 1
                and isinstance(line[0], WikiNode)
                and line[0].kind == NodeKind.LIST
            ):
                if glosses_index is None:
                    glosses_index = i
                glosses_lists.append(line[0])

    if glosses_index is None:
        # Could not find any glosses.
        # logger.info(f"  //// {wxr.wtp.title}\n  MISSING GLOSSES")
        wxr.wtp.warning("Missing glosses", sortid="pos/20250121")
        data.tags.append("no-gloss")

    template_data: list[TemplateData] = []
    category_data: list[str] = []
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in bold_node_handler
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not templates inside templates.
    template_depth = 0
    top_template_name: str | None = None

    def bold_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | None:
        """Insert special markers `__*S__` and `__*E__` around bold nodes so
        that the strings can later be split into "head-word" and "tag-words"
        parts. Collect incidental stuff, like side-tables, that are often
        put around the head."""
        assert isinstance(node, WikiNode)
        kind = node.kind
        nonlocal template_depth
        nonlocal top_template_name
        if kind == NodeKind.BOLD or (
            isinstance(node, HTMLNode)
            and (
                node.tag == "span"
                and "style" in node.attrs
                and (
                    "bold" in node.attrs["style"]
                    # Special handling for output for stuff in arabic script
                    or node.attrs["style"] == "color:black; font-size:200%;"
                )
                or node.tag == "b"
                or node.tag == "strong"
            )
        ):
            # These are word forms almost always
            return ["__B__", *node.children, "__/B__"]
        elif kind == NodeKind.ITALIC or (
            isinstance(node, HTMLNode)
            and (
                node.tag == "span"
                and "style" in node.attrs
                and (
                    "italic" in node.attrs["style"]
                )
                or node.tag == "i"
                or node.tag == "em"
            )
        ):
            # These are almost always tag words; often 'kai' isn't italicized,
            # for example.
            return ["__I__", *node.children, "__/I__"]
        elif isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # the templates are handled with bold_node_handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                template_data.append(
                    TemplateData(
                        name=node.template_name,
                        args={
                            str(k): clean_node(wxr, None, v)
                            for k, v in node.template_parameters.items()
                        },
                        expansion=expanded,
                    )
                )
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            # Track depth across the recursive node_to_text call so only
            # top-level templates are recorded above.
            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=bold_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret
        elif kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                return None
            if node.largs[0][0].startswith("Κατηγορία:"):
                category_data.append(node.largs[0][0][len("Κατηγορία:") :])
                return [""]
            # Special case for meta-links like Πρότυπο:ετ that generate
            # both a category link and :category link that is actually
            # displayed as a link, but for our purposes we want to ignore
            # that it is a link; it's a tag.
            if node.largs[0][0].startswith(":Κατηγορία:"):
                # unpacking a list-comprehension, unpacking into a list
                # seems to be more performant than adding lists together.
                return [
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                        node_handler_fn=bold_node_handler_fn,
                    )
                    # output the "visible" half of the link.
                ]
            if node.largs[0][0].startswith("Αρχείο:"):
                return [""]
            # Often forms are 'formatted' with links, so let's mark these
            # too.
            return [
                "__L__",
                wxr.wtp.node_to_text(
                    node.largs[1:2] or node.largs[0],
                    node_handler_fn=bold_node_handler_fn,
                ),
                # output the "visible" half of the link.
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]
            # print(f"{node.largs=}")

        elif kind in {
            NodeKind.TABLE,
        }:
            # XXX Handle tables here
            # template depth and top-level template name
            nonlocal table_nodes
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # Get Head Line
    # Head *should* be immediately before the glosses...
    # print(node_lines[:glosses_index])
    found_head = False

    # Walk the pre-gloss lines backwards: the head is expected to be the
    # last non-empty line before the gloss lists.
    for line in reversed(node_lines[:glosses_index]):
        # Reset per-line collection state for the closure above.
        template_data = []
        template_depth = 0
        stripped = (
            wxr.wtp.node_to_text(line, node_handler_fn=bold_node_handler_fn)
            .removeprefix(":")
            .strip()
        )
        if not stripped:
            continue
        if not found_head and parse_head(wxr, data, stripped):
            # print(data)
            found_head = True
    if not found_head:
        # There are a bunch of Greek Wiktionary articles with POS sections
        # without heads, but they seem to always follow ones with heads;
        # in this case, the result is just not including any `forms` field
        # for these (or copying the previous one).

        if prev_data is None:
            wxr.wtp.warning(
                f"Part of speech missing head: {wxr.wtp.title}",
                sortid="pos/460/20250104",
            )
        else:
            # No head found, copy previous (in this language)
            data.forms = [
                form.model_copy(deep=True) for form in prev_data.forms
            ]

    if len(template_data) > 0:
        data.head_templates = template_data
        # logger.info(
        #     f"  //// {wxr.wtp.title}\n  >>>"
        #     + "\n  >>>".join(repr(td) for td in template_data)
        # )

    if len(table_nodes) > 0:
        # Side-tables collected by the closure; parsed as inflection data.
        for template_name, table_node in table_nodes:
            # XXX template_name
            parse_table(
                wxr,
                table_node,
                data,
                data.lang_code in GREEK_LANGCODES,
                template_name=template_name or "",
            )

    data.forms = remove_duplicate_forms(wxr, data.forms)

    # Ignore images and files
    # 2025-01-17 13:10:10,035 INFO:   //// Ηρόδοτος
    #   //// [[Αρχείο:Herodotus Massimo Inv124478.jpg|200px|thumb|[[προτομή]] του Ηροδότου]]

    # Have to ignore {{(( specifically. Creates columns.
    # 2025-01-17 13:10:11,059 INFO:   //// κάνω
    #   //// {{((|width=97%}}

    # logger.info(f"<<<<<<<<< {wxr.wtp.title}\n<<<" + "\n<<<".join(pparts))
    # see: free -> {{en-verb-'free'}} creates a floating inflection table
    # followed by the usual head template

    # see: τηλεομοιοτυπία
    # '''{{PAGENAME}}''' {{θ}}
    # theta is basically {{f|...}}

    # see: θηλυκός
    # '''{{PAGENAME}}, -ή''' ''και'' '''-ιά, -ό'''
    # pagename, -e and -ia, -o, no indication of what these mean

    # Ιόνια νησιά
    # >>>'''{{PAGENAME}}''' ''πληθυντικός του'' [[ιόνιος|Ιόνιο]] [[νησί]]
    # plural of 'Ionian island'

    # >>>>>>>>> free
    # >>>{{en-adj-r}}  # floating table
    # >>>{{τ|en|{{PAGENAME}}}}, ''συγκριτικός'' '''freer''', ''υπερθετικός'' '''freest'''
    # pretty consistent bolding and italics

    # genus
    # {{τ|la|{{PAGENAME}}}} {{ο}} ([[γενική]]: generis) (3ης [[κλίση]]ς)

    # ουδέτερος
    # >>>'''{{PAGENAME}} -η -ο'''

    # καφέ
    # >>>'''{{PAGENAME}}''' {{ακλ|επίθ}}
    # aklitos, uninflected

    # καφέ
    # >>>[[Αρχείο:Cafe Museum - Vienna (5402363918).jpg|μικρογραφία|τραπέζι σε βιενέζικο '''καφέ''']]
    # >>>'''{{PAGENAME}}''' {{ο}} {{ακλ|ουδ}}
    # Ignore images

    # κρόκος
    # >>>{| align="right"
    # >>>
    # >>>|-
    # >>>
    # >>>|[[Αρχείο:Crocus sativus1.jpg|thumb|150px|Άνθη '''κρόκου''' (''Crocus sativus'').]]
    # >>>
    # >>>
    # >>>|[[Αρχείο:Raw egg.jpg|thumb|200px|Ο '''κρόκος''' ενός αβγού.]]
    # >>>
    # >>>
    # >>>|}
    # >>>
    # >>>'''{{PAGENAME}}''' {{α}}

    # p
    # >>>'''{{PAGENAME}}''' ''πεζό'' (''κεφαλαίο:'' '''{{l|P|la}}''')
    # lowercase, uppercase

    # Δημόκριτος
    # >>>'''{{PAGENAME}}'''
    # >>># {{όνομα||α}}
    # >>>{{clear}}
    # Clear is just formatting to move the line down where there are empty
    # margins.

    # BIG ASSUMPTION: let us assume that Greek Wiktionary doesn't use templates
    # that generate multiline text that is part of head. That is, we can see
    # each newline because they are in strings, and when something that does
    # generate virtual newlines (list) pops up, that's when the head portion
    # ends.
    # Greek Wiktionary head sections look like this:
    # > Pre-head templates that create side-tables, like inflections
    # > Possible formatting templates like {{clear}} that should be ignored
    # > Head template last before glosses list
    # > Clear again...
    # > Glosses list tree, where we can stop.
    # We can create "lines" of these by looping over the items in pos_content
    # and looking for newlines in strings, because that's where they mainly
    # should be (except side-table templates). We handle earlier lines
    # differently than the last line before the glosses list, which is the
    # head.

    # return None

    # ======================

    ### Glosses after head ###
    # parts = []
    got_senses = False
    for lst in glosses_lists:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses, but with
        # Simple English Wiktionary that seems rare.
        # logger.debug(f"{lst}")
        senses = recurse_glosses(wxr, lst, data)
        if len(senses) > 0:
            got_senses = True
            data.senses.extend(senses)

    if not got_senses and len(glosses_lists) > 0:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    # If there is no list, clump everything into one gloss.
    # if not len(glosses_lists > 0):
    #     sense = Sense()
    #     found_gloss = parse_gloss(wxr, sense, pos_contents[i:])
    #     if found_gloss is True or len(sense.raw_tags) > 0:
    #         convert_tags_in_sense(sense)
    #         if len(sense.glosses) == 0 and "no-gloss" not in sense.tags:
    #             sense.tags.append("no-gloss")
    #         data.senses.append(sense)

    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    #####
    #####
    # TEMP DEBUG PRINTS

    # Handle this POS section's own sub-sections (translations,
    # inflections, linkages); other heading types are ignored here.
    pos_sublevels = list(
        node.find_child(LEVEL_KIND_FLAGS)
        # include empty string only for debug printing?
    )

    for sl in pos_sublevels:
        subtitle = clean_node(wxr, None, sl.largs).lower().strip()

        type, pos, heading_name, tags, num, ok = parse_lower_heading(
            wxr, subtitle
        )

        if type == Heading.Translations:
            process_translations(wxr, data, sl)
        elif type == Heading.Infl:
            process_inflection_section(wxr, data, sl)
        elif type in (
            Heading.Related,
            Heading.Synonyms,
            Heading.Antonyms,
            Heading.Transliterations,
        ):
            process_linkage_section(wxr, data, sl, type)
        # if type not in (
        #     Heading.Translations,
        #     Heading.Ignored,
        #     Heading.Infl,
        #     Heading.Related,
        #     Heading.Synonyms,
        #     Heading.Antonyms,
        #     Heading.Derived,
        #     # We're going to ignore homonyms because they're
        #     # only tangentially related, like anagrams
        #     Heading.Homonyms,
        # ):
        #     # ...
        #     expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(sl))
        #     # text = clean_node(wxr, None, sl)
        #     logger.warning(
        #         f"""
        # {wxr.wtp.title}: {type}, '{heading_name}', {ok=}
        # {expanded}

        # ###########################
        # """
        #     )

    #####
    #####
    return data

505 

506 

# One or more consecutive "(...)" groups (with surrounding whitespace) at
# the very start of a gloss line; used to peel off leading parenthesized
# tag/form annotations before the gloss text proper.
PARENS_BEFORE_RE = re.compile(r"\s*(\([^()]+\)\s*)+")
# A single parenthesized group; findall() yields each group's inner text.
ITER_PARENS_RE = re.compile(r"\(([^()]+)\)")

509 

510 

def bold_node_fn(
    node: WikiNode,
) -> list[str | WikiNode] | None:
    """Handle nodes in the parse tree specially.

    Wrap the children of italic and bold nodes in `__I__`/`__/I__` and
    `__B__`/`__/B__` marker strings so later text processing can tell
    head-words and tag-words apart; all other nodes are left to the
    default handling (return None).
    """
    # Marker pairs keyed by the node kinds we rewrite.
    wrappers = {
        NodeKind.ITALIC: ("__I__", "__/I__"),
        NodeKind.BOLD: ("__B__", "__/B__"),
    }
    markers = wrappers.get(node.kind)
    if markers is None:
        # Not a bold/italic node: let the caller process it normally.
        return None
    opener, closer = markers
    return [opener, *node.children, closer]

538 

539 

def extract_form_of_templates(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
    siblings: list[str | WikiNode],
    siblings_index: int,
) -> None:
    """Parse form_of for nouns, adjectives and verbs.

    Supports:
    * κλ | generic | form_of
    * γρ | generic | form_of
    * πτώση/πτώσεις | nouns, adjectives etc. | form_of and tags
    * ρημ τύπος | verbs | form_of
    * μτχ | verbs | form_of

    * References:
    https://el.wiktionary.org/wiki/Πρότυπο:κλ
    https://el.wiktionary.org/wiki/Module:άλλημορφή
    https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_κλιτικούς_τύπους
    https://el.wiktionary.org/wiki/Πρότυπο:ρημ_τύπος
    https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_μετοχές
    """
    t_name = t_node.template_name

    def delegate(argument: int) -> None:
        # Common delegation to the generic lemma extractor; only the
        # template argument holding the lemma differs per template.
        extract_form_of_templates_basic(
            wxr,
            parent_sense,
            siblings,
            siblings_index,
            t_name,
            t_node,
            extract_argument=argument,
        )

    # Generic inflected-form template.
    if t_name == "κλ":
        delegate(2)
        return

    # Notes:
    # * All occurrences in wiktionary have at least one argument
    # * Only handle cases where the second argument refers to a form:
    #   μορφ / μορφή / λόγια μορφή του, etc.
    #   and ignore those mistakenly used as synonym templates
    if t_name == "γρ" and 2 in t_node.template_parameters:
        second_arg_str = clean_node(
            wxr, None, t_node.template_parameters[2]
        )
        if "μορφ" in second_arg_str:
            delegate(1)
            return

    # Noun and adjective case templates get their own parser, which also
    # derives grammatical tags from the template name itself.
    if "πτώσεις" in t_name or "πτώση" in t_name:
        extract_form_of_templates_ptosi(wxr, parent_sense, t_node)
        return

    # Verb form template.
    if t_name == "ρημ τύπος":
        delegate(2)
        return

    # Participle templates.
    if t_name.startswith("μτχ"):
        delegate(1)
        return

601 

def extract_form_of_templates_basic(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    siblings: list[str | WikiNode],
    sibling_index: int,
    t_name: str,
    t_node: TemplateNode,
    extract_argument: int | str,
) -> None:
    """Extract a form-of lemma from `extract_argument` of `t_node`.

    If the template has no such argument, fall back to consuming the
    link nodes (and whitespace) that immediately follow the template in
    `siblings` and using their text as the lemma. A non-empty lemma is
    appended to `parent_sense.form_of`; otherwise a warning is logged.
    """
    t_args = t_node.template_parameters
    if extract_argument not in t_args:
        # mtxpp template has no args, consume the next links for the
        # form_of field
        wxr.wtp.warning(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/570/20250517",
        )
        links: list[str | WikiNode] = []
        # Collect contiguous links/whitespace after the template; stop at
        # the first sibling that is neither.
        for node in siblings[sibling_index + 1 :]:
            if not (
                (isinstance(node, str) and node.strip() == "")
                or (isinstance(node, WikiNode) and node.kind == NodeKind.LINK)
            ):
                break
            links.append(node)
        lemma = clean_node(wxr, None, links).strip()
    else:
        lemma = clean_node(wxr, None, t_args[extract_argument]).strip()

    if lemma:
        form_of = FormOf(word=lemma)
        parent_sense.form_of.append(form_of)
    else:
        wxr.wtp.warning(
            "Lemma extract from form-of template was empty or whitespace:"
            f"{t_name}, {t_args=}, {lemma=}",
            sortid="pos/609/20250925",
        )

641 

def extract_form_of_templates_ptosi(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
) -> None:
    """Parse form_of for nouns and adjectives.

    Supports:
    * [gender του] πτώση-πτώσεις templates

    Decodes the template *name* into grammatical tags (gender, case(s),
    number) and takes the lemma from positional argument 1. Appends a
    FormOf to `parent_sense.form_of` and extends `parent_sense.tags`;
    returns silently on a malformed template name or missing lemma.

    Notes:
    * πτώση has exactly one case, πτώσεις as at least two cases
    """
    t_name = t_node.template_name
    inflection_t_names = ("πτώσεις", "πτώση")
    tags: list[str] = []

    # Parse and consume gender if any
    if "-" in t_name:
        # Cf. {{ουδ του-πτώσειςΟΑΚεν|καλός|grc}}
        # maxsplit=1 so an unexpected second hyphen cannot raise
        # ValueError; such malformed names then fall through the
        # KeyError guards below and are skipped, consistent with the
        # other bad-name paths.
        gender, inflection = t_name.split("-", 1)
        code = gender[:3]
        GENDER_INFLECTION_MAP = {
            "θηλ": "feminine",
            "αρσ": "masculine",
            "ουδ": "neuter",
        }
        try:
            gender_tag = GENDER_INFLECTION_MAP[code]
        except KeyError:
            # Bad template name.
            return
        tags.append(gender_tag)
    else:
        inflection = t_name

    # Remove πτώση-πτώσεις prefix
    for prefix in inflection_t_names:
        if inflection.startswith(prefix):
            inflection = inflection[len(prefix) :]
            break

    PTOSI_INFLECTION_MAP = {
        "Ο": "nominative",
        "Α": "accusative",
        "Γ": "genitive",
        "Κ": "vocative",
    }

    # The πτώση-πτώσεις templates contains:
    # * Case(s) (1 for πτώση, >1 for πτώσεις) in uppercase characters.
    # * Number in either "εν" (singular) or "πλ" (plural)
    #
    # Examples:
    # * {{πτώσηΑεν|κόρφος}} > accusative | singular
    # * {{πτώσειςΟΚπλ|κόρφος}} > nominative, vocative | plural
    try:
        lowercase = "".join(ch for ch in inflection if ch.islower())
        number = {"εν": "singular", "πλ": "plural"}[lowercase]
        uppercase = [ch for ch in inflection if not ch.islower()]
        cases = [PTOSI_INFLECTION_MAP[ch] for ch in uppercase]
    except KeyError:
        # Bad template name.
        return

    # Was: tags.extend([elt for elt in cases + [number]]) — the identity
    # comprehension and intermediate list were pointless.
    tags.extend(cases)
    tags.append(number)

    t_args = t_node.template_parameters

    if 1 not in t_args:
        wxr.wtp.warning(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/620/20250416",
        )
        return

    lemma = clean_node(wxr, None, t_args[1])
    form_of = FormOf(word=lemma)
    parent_sense.form_of.append(form_of)
    tags.sort()  # For the tests, but also good practice
    parent_sense.tags.extend(tags)

724 

def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Take what is preferably a line of text and extract tags and a gloss from
    it. The data is inserted into parent_sense, and for recursion purposes
    we return a boolean that tells whether there was any gloss text in a
    lower node.

    Side effects on `parent_sense`: may append to `glosses`, `related`,
    `raw_tags`, `form_of` (via the form-of template extractors), and
    whatever `clean_node` collects (e.g. categories).
    """
    if len(contents) == 0:
        return False

    # Form-of templates anywhere on the line contribute form_of data.
    for i, t_node in enumerate(contents):
        if isinstance(t_node, TemplateNode):
            extract_form_of_templates(wxr, parent_sense, t_node, contents, i)

    bl_linkages: list[Linkage] = []
    no_gloss_but_keep_anyway = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        # {{βλ|...}} ("see also") templates: collect their positional
        # arguments as related-word linkages and erase them from the text.
        nonlocal bl_linkages
        if name == "βλ":
            for k, v in ht.items():
                if isinstance(k, int):
                    bl_linkages.append(Linkage(word=clean_node(wxr, None, v)))
            return ""
        return None

    # The rest of the text.
    text = clean_node(
        wxr,
        parent_sense,
        contents,
        template_fn=bl_template_handler_fn,
        node_handler_fn=bold_node_fn,
    )

    if len(bl_linkages) > 0:
        parent_sense.related.extend(bl_linkages)
        # A line consisting only of βλ links still yields a (gloss-less)
        # sense instead of being dropped.
        no_gloss_but_keep_anyway = True

    if not text.strip():
        if len(bl_linkages) <= 0:
            return False

    # Greek Wiktionary uses a lot of template-less tags.
    if parens_n := PARENS_BEFORE_RE.match(text):
        blocks = ITER_PARENS_RE.findall(parens_n.group(0))
        kept_blocks: list[str] = []
        forms: list[str] = []
        raw_tag_texts: list[str] = []
        for block in blocks:
            if block_has_non_greek_text(block):
                # Keep parentheses with non-greek text with gloss text)
                kept_blocks.extend(("(", block, ") "))
                continue
            nforms, nraw_tag_texts = extract_forms_and_tags(block)
            forms.extend(nforms)
            raw_tag_texts.extend(nraw_tag_texts)
        if forms:
            parent_sense.related.extend(Linkage(word=form) for form in forms)
        parent_sense.raw_tags.extend(raw_tag_texts)
        kept_blocks.append(text[parens_n.end() :])
        text = "".join(kept_blocks)

    # Strip the bold/italic markers inserted by bold_node_fn.
    text = re.sub(r"__/?[IB]__", "", text)

    if len(text) > 0:
        parent_sense.glosses.append(text)
        return True

    if no_gloss_but_keep_anyway:
        # NOTE(review): elsewhere in this module "no-gloss" is placed in
        # `tags`, not `raw_tags` — confirm whether this should be
        # parent_sense.tags.append("no-gloss"); kept as-is for now.
        parent_sense.raw_tags.append("no-gloss")
        return True

    return False

809 

810 

# Role-naming aliases: all three are plain Linkage objects, distinguished
# only by which slot of recurse_glosses1's return tuple they occupy.
Related: TypeAlias = Linkage
Synonym: TypeAlias = Linkage
Antonym: TypeAlias = Linkage

814 

815 

816def recurse_glosses1( 

817 wxr: WiktextractContext, 

818 parent_sense: Sense, 

819 node: WikiNode, 

820) -> tuple[ 

821 list[Sense], 

822 list[Example], 

823 list[Related], 

824 list[Synonym], 

825 list[Antonym], 

826]: 

827 """Helper function for recurse_glosses""" 

828 # print(f"{node=}") 

829 

830 ret_senses: list[Sense] = [] 

831 ret_examples: list[Example] = [] 

832 ret_related: list[Related] = [] 

833 ret_synonyms: list[Synonym] = [] 

834 ret_antonyms: list[Antonym] = [] 

835 found_gloss = False 

836 

837 # Pydantic stuff doesn't play nice with Tatu's manual dict manipulation 

838 # functions, so we'll use a dummy dict here that we then check for 

839 # content and apply to `parent_sense`. 

840 dummy_parent: dict = {} 

841 

842 related_linkages: list[Linkage] = [] 

843 example_is_synonym = False 

844 example_is_antonym = False 

845 

846 def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None: 

847 nonlocal related_linkages 

848 nonlocal example_is_synonym 

849 nonlocal example_is_antonym 

850 # Sometimes the bl-templates point to synonyms or antonyms, instead 

851 # of just "related"; we save them, and if example_is_xxxnym is true, 

852 # we later return them as xxxnyms. 

853 if name == "βλ": 

854 for k, v in ht.items(): 

855 if isinstance(k, int): 

856 related_linkages.append( 

857 Linkage(word=clean_node(wxr, None, v)) 

858 ) 

859 return "" 

860 if name == "συνων": 

861 example_is_synonym = True 

862 return "" 

863 if name == "αντων": 

864 example_is_antonym = True 

865 return "" 

866 return None 

867 

868 # List nodes contain only LIST_ITEM nodes, which may contain sub-LIST nodes. 

869 if node.kind == NodeKind.LIST: 

870 list_ret: tuple[ 

871 list[Sense], 

872 list[Example], 

873 list[Related], 

874 list[Synonym], 

875 list[Antonym], 

876 ] = ([], [], [], [], []) 

877 for child in node.children: 

878 if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM: 878 ↛ 880line 878 didn't jump to line 880 because the condition on line 878 was never true

879 # This should never happen 

880 wxr.wtp.error( 

881 f"{child=} is direct child of NodeKind.LIST", 

882 sortid="simple/pos/44", 

883 ) 

884 continue 

885 ( 

886 senses, 

887 examples, 

888 related, 

889 synonyms, 

890 antonyms, 

891 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child) 

892 list_ret[0].extend(senses) 

893 list_ret[1].extend(examples) 

894 list_ret[2].extend(related) 

895 list_ret[3].extend(synonyms) 

896 list_ret[4].extend(antonyms) 

897 return list_ret 

898 

899 elif node.kind == NodeKind.LIST_ITEM: 899 ↛ 1001line 899 didn't jump to line 1001 because the condition on line 899 was always true

900 # Split at first LIST node found 

901 split_at = next( 

902 ( 

903 i 

904 for i, c in enumerate(node.children) 

905 if isinstance(c, WikiNode) and c.kind == NodeKind.LIST 

906 ), 

907 len(node.children), 

908 ) 

909 contents = node.children[:split_at] 

910 sublists = node.children[split_at:] 

911 

912 # A LIST and LIST_ITEM `sarg` is basically the prefix of the line, like 

913 # `#` or `##:`: the token that appears at the very start of a line that 

914 # is used to parse the depth and structure of lists. 

915 # `#` Item 1 

916 # `##` Item 1.1 

917 # `##*` Example 1.1 

918 if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"): 

919 # This is either a quotation or example. 

920 text = clean_node( 

921 wxr, dummy_parent, contents, template_fn=bl_template_handler_fn 

922 ).strip("⮡ \n") 

923 

924 # print(f"{contents=}, {text=}, {related_linkages=}") 

925 

926 if example_is_synonym or example_is_antonym: 

927 link_linkages = [] 

928 for snode in contents: 

929 if not isinstance(snode, WikiNode): 

930 continue 

931 if snode.kind == NodeKind.LINK: 

932 link_linkages.append( 

933 Linkage( 

934 word=clean_node(wxr, None, snode.largs[0][0]) 

935 ) 

936 ) 

937 else: 

938 for link in snode.find_child_recursively(NodeKind.LINK): 938 ↛ 939line 938 didn't jump to line 939 because the loop on line 938 never started

939 link_linkages.append( 

940 Linkage(word=clean_node(wxr, None, link)) 

941 ) 

942 

943 # print("=====") 

944 # print(f"{link_linkages=}") 

945 

946 if example_is_synonym: 

947 return [], [], [], link_linkages + related_linkages, [] 

948 elif example_is_antonym: 948 ↛ 951line 948 didn't jump to line 951 because the condition on line 948 was always true

949 return [], [], [], [], link_linkages + related_linkages 

950 

951 if len(related_linkages) > 0: 

952 # parent_sense.related.extend(bl_linkages) 

953 # related_linkages = [] 

954 # if not text.strip(): 

955 return [], [], related_linkages, [], [] 

956 

957 example_is_synonym = False 

958 example_is_antonym = False 

959 

960 if not text.strip(): 960 ↛ 961line 960 didn't jump to line 961 because the condition on line 960 was never true

961 return [], [], [], [], [] 

962 

963 example = Example(text=text) 

964 # logger.debug(f"{wxr.wtp.title}/example\n{text}") 

965 if len(sublists) > 0: 

966 translation = clean_node(wxr, dummy_parent, sublists).strip( 

967 "#*: \n" 

968 ) 

969 if translation != "": 969 ↛ 972line 969 didn't jump to line 972 because the condition on line 969 was always true

970 example.translation = translation 

971 

972 for k, v in dummy_parent.items(): 972 ↛ 973line 972 didn't jump to line 973 because the loop on line 972 never started

973 if k == "categories": 

974 parent_sense.categories.extend(v) 

975 dummy_parent = {} 

976 

977 return [], [example], [], [], [] 

978 

979 found_gloss = parse_gloss(wxr, parent_sense, contents) 

980 

981 for sl in sublists: 

982 if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST): 982 ↛ 984line 982 didn't jump to line 984 because the condition on line 982 was never true

983 # Should not happen 

984 wxr.wtp.error( 

985 f"Sublist is not NodeKind.LIST: {sublists=!r}", 

986 sortid="simple/pos/82", 

987 ) 

988 continue 

989 ( 

990 senses, 

991 examples, 

992 related, 

993 synonyms, 

994 antonyms, 

995 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), sl) 

996 ret_senses.extend(senses) 

997 ret_examples.extend(examples) 

998 ret_related.extend(related) 

999 ret_synonyms.extend(synonyms) 

1000 ret_antonyms.extend(antonyms) 

1001 if len(ret_senses) > 0: 

1002 # the recursion returned actual senses from below, so we will 

1003 # ignore everything else (incl. any example data that might have 

1004 # been given to parent_sense) and return that instead. 

1005 # XXX if this becomes relevant, add the example data to a returned 

1006 # subsense instead? 

1007 # if any( 

1008 # isinstance(r, Sense) and r.raw_tags == ["no-gloss"] for r in ret 

1009 # ): 

1010 # print(f"{ret=}") 

1011 return ( 

1012 combine_senses_with_identical_glosses(ret_senses), 

1013 [], 

1014 [], 

1015 [], 

1016 [], 

1017 ) 

1018 

1019 # If nothing came from below, then this. 

1020 if found_gloss is True or "no-gloss" in parent_sense.raw_tags: 1020 ↛ 1028line 1020 didn't jump to line 1028 because the condition on line 1020 was always true

1021 parent_sense.examples.extend(ret_examples) 

1022 parent_sense.related.extend(ret_related) 

1023 parent_sense.synonyms.extend(ret_synonyms) 

1024 parent_sense.antonyms.extend(ret_antonyms) 

1025 

1026 return [parent_sense], [], [], [], [] 

1027 

1028 return [], ret_examples, ret_related, ret_synonyms, ret_antonyms 

1029 

1030 

def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Recurse through WikiNodes to find glosses and sense-related data.

    Starts the recursion with a fresh base Sense and returns the fully
    tag-converted senses found under `node`.  `data` is part of the shared
    call signature but is not read here.
    """
    senses, examples, related, synonyms, antonyms = recurse_glosses1(
        wxr, Sense(), node
    )

    # Only Sense objects should reach the top level; any stray examples or
    # linkages here indicate list structure the recursion did not attach.
    if examples or related or synonyms or antonyms:
        wxr.wtp.error(
            "NOT Sense has bubbled to recurse_glosses: "
            f"{examples=}, {related=}, {synonyms=}, {antonyms=}",
            sortid="pos/glosses/966",
        )

    processed: list[Sense] = []
    for sense in senses:
        convert_tags_in_sense(sense)
        processed.append(sense)

    return processed

1057 

1058 

def split_nodes_to_lines(
    nodes: list[WikiNode | str],
) -> Iterator[list[WikiNode | str]]:
    """Take a list of nodes and split up the list into lines.

    This could be done by using node_to_wikitext() to reverse the parsing
    and re-parsing individual text lines, but that seems unnecessary in the
    context of Greek Wiktionary PoS sections.  Yields one list of nodes per
    logical line; LIST nodes are always yielded alone as their own line.
    """
    buffered: list[WikiNode | str] = []
    for item in nodes:
        if isinstance(item, WikiNode):
            if item.kind == NodeKind.LIST:
                # A list terminates the current line and forms its own line.
                if buffered:
                    yield buffered
                    buffered = []
                yield [item]
            elif isinstance(item, TemplateNode) and item.template_name in (
                # Layout-only templates, like {{((}} that bookends a column,
                # are dropped entirely.
                "((",
                "))",
                "clear",
                "κλείδα-ελλ",
            ):
                pass
            else:
                buffered.append(item)
            continue

        # Plain string: embedded newlines terminate lines.
        if "\n" not in item:
            if item:
                buffered.append(item)
            continue

        pieces = item.splitlines()
        for piece in pieces[:-1]:
            if piece:
                buffered.append(piece)
            yield buffered
            buffered = []
        # splitlines() swallows a trailing newline, so the final piece needs
        # separate handling: a trailing "\n" closes the line, otherwise the
        # remainder stays buffered for the next node.
        if item.endswith("\n"):
            if pieces[-1]:
                buffered.append(pieces[-1])
            yield buffered
            buffered = []
        elif pieces[-1]:
            buffered.append(pieces[-1])

    # Flush whatever is left at the end of input.
    if buffered:
        yield buffered

1108 

1109 

# Captures bold/italic markers and the ", " / ". " fragment separators so
# that re.split() interleaves text chunks with the separators themselves.
BOLD_RE = re.compile(r"(__/?[BI]__|, |\. )")


def extract_forms_and_tags(tagged_text: str) -> tuple[list[str], list[str]]:
    """Split marker-annotated text into (forms, raw tags).

    Fragments appearing between __B__ and __/B__ markers are collected as
    word forms; every other non-empty fragment (text between ", ", ". " and
    marker separators) is collected as a raw tag.  Italic markers are
    captured by the pattern but currently have no effect.
    """
    forms: list[str] = []
    tags: list[str] = []
    in_bold = False

    for idx, chunk in enumerate(BOLD_RE.split(tagged_text)):
        chunk = chunk.strip()
        if not chunk:
            continue
        if idx % 2 == 1:
            # Odd indices hold the captured separators themselves; only the
            # bold markers change state.  ", " and ". " merely split.
            if chunk == "__B__":
                in_bold = True
            elif chunk == "__/B__":
                in_bold = False
            continue
        # Even indices: the text between separators.
        if in_bold:
            forms.append(chunk)
        else:
            tags.append(chunk)

    return forms, tags

1151 

1152 

# Matches the __B__/__I__/__L__/__E__-style pseudo-markup markers so they
# can be stripped before script detection.
META_RE = re.compile(r"__/?[ILEB]__")


def block_has_non_greek_text(text: str) -> bool:
    """Return True if any word in `text` begins with a non-Greek letter.

    Markers like __B__ are stripped first.  Only the first alphabetic
    character of each whitespace-separated word is inspected, using its
    Unicode character name ("GREEK ..." prefix) to decide the script.
    """
    cleaned = META_RE.sub("", text)
    for word in cleaned.split():
        # Locate the first alphabetic character of the word, if any.
        first_letter = next((ch for ch in word if ch.isalpha()), None)
        if first_letter is None:
            continue
        if not unicode_name(first_letter).startswith("GREEK"):
            return True
    return False

1166 

1167 

def combine_senses_with_identical_glosses(
    orig_senses: list[Sense],
) -> list[Sense]:
    """Merge senses whose gloss lists are exactly identical.

    When every sense has a distinct gloss tuple the input list is returned
    unchanged; otherwise one merged Sense per distinct gloss tuple is
    returned, in first-seen order, with duplicates folded into the first
    occurrence via Sense.merge().
    """
    buckets: dict[tuple[str, ...], list[Sense]] = {}
    for sense in orig_senses:
        buckets.setdefault(tuple(sense.glosses), []).append(sense)

    # No gloss tuple occurs twice: nothing to combine.
    if all(len(group) == 1 for group in buckets.values()):
        return orig_senses

    merged: list[Sense] = []
    for group in buckets.values():
        first, *duplicates = group
        for duplicate in duplicates:
            first.merge(duplicate)
        merged.append(first)

    return merged