Coverage for src/wiktextract/extractor/el/pos.py: 77%

446 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2from collections.abc import Iterator 

3from typing import TypeAlias 

4from unicodedata import name as unicode_name 

5 

6from wikitextprocessor import ( 

7 HTMLNode, 

8 NodeKind, 

9 TemplateArgs, 

10 TemplateNode, 

11 WikiNode, 

12) 

13from wikitextprocessor.parser import LEVEL_KIND_FLAGS 

14 

15from wiktextract import WiktextractContext 

16from wiktextract.page import clean_node 

17from wiktextract.wxr_logging import logger 

18 

19from .head import parse_head 

20from .linkages import process_linkage_section 

21from .models import Example, FormOf, Linkage, Sense, TemplateData, WordEntry 

22from .parse_utils import ( 

23 GREEK_LANGCODES, 

24 Heading, 

25 parse_lower_heading, 

26 remove_duplicate_forms, 

27) 

28from .section_titles import POS_HEADINGS 

29from .table import parse_table, process_inflection_section 

30from .tags_utils import convert_tags_in_sense 

31from .text_utils import ( 

32 ENDING_NUMBER_RE, 

33 normalized_int, 

34) 

35from .translations import process_translations 

36 

37# from wiktextract.wxr_logging import logger 

38 

39 

def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    prev_data: WordEntry | None,  # data from the last entry in this language
    pos: str,  # the "noun" in "Noun 2"
    title: str,
    pos_tags: list[str],
    pos_num: int = -1,  # the "2" in "Noun 2"
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'.

    `data` provides basic data common with other POS sections, like
    pronunciation or etymology; it is mutated in place and returned.
    """
    data.pos = pos  # the internal/translated name for the POS
    data.pos_num = pos_num  # SEW uses "Noun 1", "Noun 2" style headings.

    wxr.wtp.start_subsection(title)

    # Sound data associated with this POS might be coming from a shared
    # section, in which case we've tried to tag the sound data with its
    # pos name + number if possible. Filter out stuff that doesn't fit.
    # This is actually pretty common, but if the edition has proper
    # hierarchies for this, doing this step might be unnecessary.
    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos section(s),
            # so we add it to everything; this is basically the default
            # behavior.
            new_sounds.append(sound)
        else:
            for sound_pos in sound.poses:
                # Split a tag like "noun 2" into its pos text and number.
                m = ENDING_NUMBER_RE.search(sound_pos)
                if m is not None:
                    s_num = normalized_int(m.group(1).strip())
                    s_pos = sound_pos[: m.start()].strip().lower()
                else:
                    s_pos = sound_pos.strip().lower()
                    s_num = -1
                sound_meta = POS_HEADINGS[s_pos]
                s_pos = sound_meta["pos"]
                if s_pos == data.pos and s_num == data.pos_num:
                    new_sounds.append(sound)
    data.sounds = new_sounds

    # Get child nodes *except* headings (= LEVEL).
    pos_contents = list(
        node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
    )

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # split_nodes_to_lines returns list items on their own 'line'.
    node_lines = list(split_nodes_to_lines(pos_contents))

    # Looking for the gloss lists: it's simplest to pull all the list nodes
    # and handle those. Anything after or in between (like categories, extra
    # templates, tables and images) can be ignored.
    glosses_index = None
    glosses_lists = []
    for i, line in enumerate(node_lines):
        if (
            len(line) == 1
            and isinstance(line[0], WikiNode)
            and line[0].kind == NodeKind.LIST
            # ":"-lists are indented text, not numbered glosses.
            and (line[0].sarg != ":")
        ):
            if glosses_index is None:
                glosses_index = i
            glosses_lists.append(line[0])

    if glosses_index is None:
        # If nothing was found, accept ":" nodes too.
        for i, line in enumerate(node_lines):
            if (
                len(line) == 1
                and isinstance(line[0], WikiNode)
                and line[0].kind == NodeKind.LIST
            ):
                if glosses_index is None:
                    glosses_index = i
                glosses_lists.append(line[0])

    if glosses_index is None:
        # Could not find any glosses.
        wxr.wtp.warning("Missing glosses", sortid="pos/20250121")
        data.tags.append("no-gloss")

    template_data: list[TemplateData] = []
    category_data: list[str] = []
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in bold_node_handler_fn
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not templates inside templates.
    template_depth = 0
    top_template_name: str | None = None

    def bold_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | None:
        """Insert special markers `__*S__` and `__*E__` around bold nodes so
        that the strings can later be split into "head-word" and "tag-words"
        parts. Collect incidental stuff, like side-tables, that are often
        put around the head."""
        assert isinstance(node, WikiNode)
        kind = node.kind
        nonlocal template_depth
        nonlocal top_template_name
        if kind == NodeKind.BOLD or (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "style" in node.attrs
            and (
                "bold" in node.attrs["style"]
                # Special handling for output for stuff in arabic script
                or node.attrs["style"] == "color:black; font-size:200%;"
            )
        ):
            # These are word forms almost always.
            return ["__B__", *node.children, "__/B__"]
        elif kind == NodeKind.ITALIC:
            # These are almost always tag words; often 'kai' isn't
            # italicized, for example.
            return ["__I__", *node.children, "__/I__"]
        elif isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # templates are handled with bold_node_handler_fn.
            # NOTE: don't use "node_to_text" here, that causes bad output.
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                template_data.append(
                    TemplateData(
                        name=node.template_name,
                        args={
                            str(k): clean_node(wxr, None, v)
                            for k, v in node.template_parameters.items()
                        },
                        expansion=expanded,
                    )
                )
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=bold_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret
        elif kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                return None
            # Swallow category links, but remember the category names.
            if node.largs[0][0].startswith("Κατηγορία:"):
                category_data.append(node.largs[0][0][len("Κατηγορία:") :])
                return [""]
            # Ignore image/file links entirely.
            if node.largs[0][0].startswith("Αρχείο:"):
                return [""]
            # Often forms are 'formatted' with links, so let's mark these
            # too.
            return [
                "__L__",
                # Unpacking into a list seems to be more performant than
                # adding lists together.
                *(
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                        node_handler_fn=bold_node_handler_fn,
                    )
                    # Output the "visible" half of the link.
                ),
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]

        elif kind in {
            NodeKind.TABLE,
        }:
            # Save side-tables (e.g. floating inflection tables) together
            # with the name of the top-level template that generated them,
            # if any; they are parsed later with parse_table().
            nonlocal table_nodes
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # Get the head line.
    # The head *should* be immediately before the glosses, so search
    # backwards from the glosses list.
    # NOTE: when glosses_index is None, the [:None] slice covers every line.
    #
    # BIG ASSUMPTION: let us assume that Greek Wiktionary doesn't use
    # templates that generate multiline text that is part of the head. That
    # is, we can see each newline because they are in strings, and when
    # something that does generate virtual newlines (a list) pops up, that's
    # when the head portion ends.
    # Greek Wiktionary head sections look like this:
    # > Pre-head templates that create side-tables, like inflections
    # > Possible formatting templates like {{clear}} that should be ignored
    # > Head template last before glosses list
    # > Clear again...
    # > Glosses list tree, where we can stop.
    # Images, {{((}} column templates and floating tables (see e.g. the
    # articles for 'free' and 'κρόκος') may precede the head and are
    # ignored by the node handler above.
    found_head = False

    for line in reversed(node_lines[:glosses_index]):
        # Reset per-line collection state used by bold_node_handler_fn.
        template_data = []
        template_depth = 0
        stripped = (
            wxr.wtp.node_to_text(line, node_handler_fn=bold_node_handler_fn)
            .removeprefix(":")
            .strip()
        )
        if not stripped:
            continue
        if not found_head and parse_head(wxr, data, stripped):
            found_head = True
    if not found_head:
        # There are a bunch of Greek Wiktionary articles with POS sections
        # without heads, but they seem to always follow ones with heads;
        # in this case, the result is just not including any `forms` field
        # for these (or copying the previous one).
        if prev_data is None:
            wxr.wtp.warning(
                f"Part of speech missing head: {wxr.wtp.title}",
                sortid="pos/460/20250104",
            )
        else:
            # No head found, copy previous (in this language).
            data.forms = [
                form.model_copy(deep=True) for form in prev_data.forms
            ]

    if len(template_data) > 0:
        data.head_templates = template_data

    if len(table_nodes) > 0:
        for template_name, table_node in table_nodes:
            parse_table(
                wxr,
                table_node,
                data,
                data.lang_code in GREEK_LANGCODES,
                template_name=template_name or "",
            )

    data.forms = remove_duplicate_forms(wxr, data.forms)

    ### Glosses after head ###
    got_senses = False
    for lst in glosses_lists:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses, but with
        # Simple English Wiktionary that seems rare.
        senses = recurse_glosses(wxr, lst, data)
        if len(senses) > 0:
            got_senses = True
            data.senses.extend(senses)

    if not got_senses and len(glosses_lists) > 0:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    # Handle sub-sections below this POS: translations, inflection tables
    # and linkage sections (related/synonyms/antonyms/transliterations).
    pos_sublevels = list(
        node.find_child(LEVEL_KIND_FLAGS)
    )

    for sl in pos_sublevels:
        subtitle = clean_node(wxr, None, sl.largs[0]).lower().strip()

        type, pos, heading_name, tags, num, ok = parse_lower_heading(
            wxr, subtitle
        )

        if type == Heading.Translations:
            process_translations(wxr, data, sl)
        elif type == Heading.Infl:
            process_inflection_section(wxr, data, sl)
        elif type in (
            Heading.Related,
            Heading.Synonyms,
            Heading.Antonyms,
            Heading.Transliterations,
        ):
            process_linkage_section(wxr, data, sl, type)
        # Other heading types (e.g. Homonyms, which are only tangentially
        # related, like anagrams) are deliberately ignored here.

    return data

479 

480 

# One or more consecutive "(...)" groups at the very start of a gloss line;
# Greek Wiktionary uses these for template-less tags and forms.
PARENS_BEFORE_RE = re.compile(r"\s*(\([^()]+\)\s*)+")
# Extracts the inner text of each individual "(...)" group via findall().
ITER_PARENS_RE = re.compile(r"\(([^()]+)\)")

483 

484 

def bold_node_fn(
    node: WikiNode,
) -> list[str | WikiNode] | None:
    """Handle nodes in the parse tree specially.

    Wraps italic and bold nodes in `__I__`/`__/I__` and `__B__`/`__/B__`
    marker strings so later text processing can tell them apart; returns
    None for every other node kind so default rendering applies.
    """
    markers = {
        NodeKind.ITALIC: ("__I__", "__/I__"),
        NodeKind.BOLD: ("__B__", "__/B__"),
    }
    pair = markers.get(node.kind)
    if pair is None:
        return None
    opener, closer = pair
    return [opener, *node.children, closer]

512 

513 

def extract_form_of_templates(
    wxr: WiktextractContext, parent_sense: Sense, t_node: TemplateNode
) -> None:
    """Parse form_of for nouns, adjectives and verbs.

    Supports:
    * κλ | generic | form_of
    * πτώση/πτώσεις | nouns, adjectives etc. | form_of and tags
    * ρημ τύπος | verbs | form_of
    * μτχ | verbs | form_of

    * References:
      https://el.wiktionary.org/wiki/Πρότυπο:κλ
      https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_κλιτικούς_τύπους
      https://el.wiktionary.org/wiki/Πρότυπο:ρημ_τύπος
      https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_μετοχές
    """
    t_name = t_node.template_name

    # Generic: {{κλ|lang|lemma}} — lemma in positional arg 2.
    if t_name == "κλ":
        _append_form_of(wxr, parent_sense, t_node, 2, "pos/535/20250416")
        return

    # Nouns and adjectives: the πτώση/πτώσεις template family also encodes
    # case/number/gender tags, handled by a dedicated parser.
    inflection_t_names = ("πτώσεις", "πτώση")
    if any(name in t_name for name in inflection_t_names):
        return extract_form_of_templates_ptosi(wxr, parent_sense, t_node)

    # Verbs: {{ρημ τύπος|lang|lemma}} — lemma in positional arg 2.
    # NOTE: previously this warning site reused sortid "pos/535/20250416"
    # from the κλ branch; it now has its own id so log lines are
    # distinguishable.
    if t_name == "ρημ τύπος":
        _append_form_of(wxr, parent_sense, t_node, 2, "pos/553/20250416")
        return

    # Participles: {{μτχ...|lemma}} — lemma in positional arg 1.
    if t_name.startswith("μτχ"):
        _append_form_of(wxr, parent_sense, t_node, 1, "pos/570/20250517")


def _append_form_of(
    wxr: WiktextractContext,
    parent_sense: Sense,
    t_node: TemplateNode,
    arg_index: int,
    sortid: str,
) -> None:
    """Append a FormOf built from positional template arg `arg_index`,
    warning (with `sortid`) and doing nothing when the arg is missing."""
    t_args = t_node.template_parameters
    if arg_index not in t_args:
        wxr.wtp.warning(
            "Form-of template does not have lemma data: "
            f"{t_node.template_name}, {t_args=}",
            sortid=sortid,
        )
        return
    lemma = clean_node(wxr, None, t_args[arg_index])
    parent_sense.form_of.append(FormOf(word=lemma))

577 

578 

def extract_form_of_templates_ptosi(
    wxr: WiktextractContext, parent_sense: Sense, t_node: TemplateNode
) -> None:
    """Parse form_of for nouns and adjectives.

    Supports:
    * [gender του] πτώση-πτώσεις templates

    Notes:
    * πτώση has exactly one case, πτώσεις has at least two cases
    """
    t_name = t_node.template_name
    inflection_t_names = ("πτώσεις", "πτώση")
    tags: list[str] = []

    # Parse and consume gender if any
    if "-" in t_name:
        # Cf. {{ουδ του-πτώσειςΟΑΚεν|καλός|grc}}
        gender, inflection = t_name.split("-")
        code = gender[:3]
        GENDER_INFLECTION_MAP = {
            "θηλ": "feminine",
            "αρσ": "masculine",
            "ουδ": "neuter",
        }
        try:
            gender_tag = GENDER_INFLECTION_MAP[code]
        except KeyError:
            # Bad template name.
            return
        tags.append(gender_tag)
    else:
        inflection = t_name

    # Remove πτώση-πτώσεις prefix, leaving only the case/number suffix.
    for prefix in inflection_t_names:
        if inflection.startswith(prefix):
            inflection = inflection[len(prefix) :]
            break

    PTOSI_INFLECTION_MAP = {
        "Ο": "nominative",
        "Α": "accusative",
        "Γ": "genitive",
        "Κ": "vocative",
    }

    # The πτώση-πτώσεις templates contains:
    # * Case(s) (1 for πτώση, >1 for πτώσεις) in uppercase characters.
    # * Number in either "εν" (singular) or "πλ" (plural)
    #
    # Examples:
    # * {{πτώσηΑεν|κόρφος}} > accusative | singular
    # * {{πτώσειςΟΚπλ|κόρφος}} > nominative, vocative | plural
    try:
        lowercase = "".join(ch for ch in inflection if ch.islower())
        number = {"εν": "singular", "πλ": "plural"}[lowercase]
        uppercase = [ch for ch in inflection if not ch.islower()]
        cases = [PTOSI_INFLECTION_MAP[ch] for ch in uppercase]
    except KeyError:
        # Bad template name.
        return

    # Collect all case tags plus the number tag (was previously a
    # redundant copying comprehension).
    tags.extend(cases)
    tags.append(number)

    t_args = t_node.template_parameters

    if 1 not in t_args:
        wxr.wtp.warning(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/620/20250416",
        )
        return

    lemma = clean_node(wxr, None, t_args[1])
    form_of = FormOf(word=lemma)
    parent_sense.form_of.append(form_of)
    tags.sort()  # For the tests, but also good practice
    parent_sense.tags.extend(tags)

658 

659 

def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Take what is preferably a line of text and extract tags and a gloss from
    it. The data is inserted into parent_sense, and for recursion purposes
    we return a boolean that tells whether there was any gloss text in a
    lower node."""
    if len(contents) == 0:
        return False

    # Collect form-of data (e.g. {{κλ}}, {{πτώση...}}) from any templates
    # on this gloss line before the text is flattened.
    for t_node in contents:
        if isinstance(t_node, TemplateNode):
            extract_form_of_templates(wxr, parent_sense, t_node)

    template_tags: list[str] = []

    # Linkages collected from {{βλ}} ("see also") templates on this line.
    bl_linkages: list[Linkage] = []
    no_gloss_but_keep_anyway = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        """Strip {{βλ}} templates from the text, saving their positional
        arguments as related-word linkages."""
        nonlocal bl_linkages
        if name == "βλ":
            for k, v in ht.items():
                if isinstance(k, int):
                    bl_linkages.append(Linkage(word=clean_node(wxr, None, v)))
            return ""
        return None

    # The rest of the text, with bold/italic wrapped in __B__/__I__ markers.
    text = clean_node(
        wxr,
        parent_sense,
        contents,
        template_fn=bl_template_handler_fn,
        node_handler_fn=bold_node_fn,
    )

    if len(bl_linkages) > 0:
        parent_sense.related.extend(bl_linkages)
        # A line consisting only of {{βλ}} links still produces a sense.
        no_gloss_but_keep_anyway = True

    if not text.strip():
        if len(bl_linkages) <= 0:
            return False

    # Greek Wiktionary uses a lot of template-less tags: leading
    # parenthesized blocks before the gloss proper.
    if parens_n := PARENS_BEFORE_RE.match(text):
        blocks = ITER_PARENS_RE.findall(parens_n.group(0))
        kept_blocks: list[str] = []
        forms: list[str] = []
        raw_tag_texts: list[str] = []
        for block in blocks:
            if block_has_non_greek_text(block):
                # Keep parentheses with non-greek text with gloss text.
                kept_blocks.extend(("(", block, ") "))
                continue
            nforms, nraw_tag_texts = extract_forms_and_tags(block)
            forms.extend(nforms)
            raw_tag_texts.extend(nraw_tag_texts)
        if forms:
            parent_sense.related.extend(Linkage(word=form) for form in forms)
        parent_sense.raw_tags.extend(raw_tag_texts)
        # Re-attach the gloss text that followed the parenthesized blocks.
        kept_blocks.append(text[parens_n.end() :])
        text = "".join(kept_blocks)

    # Remove the __B__/__I__ markers inserted by bold_node_fn.
    text = re.sub(r"__/?[IB]__", "", text)

    if len(template_tags) > 0:
        parent_sense.raw_tags.extend(template_tags)

    if len(text) > 0:
        parent_sense.glosses.append(text)
        return True

    if no_gloss_but_keep_anyway:
        # NOTE(review): "no-gloss" is appended to raw_tags here, while other
        # call sites in this file put it in `tags` — confirm whether this
        # is intentional.
        parent_sense.raw_tags.append("no-gloss")
        return True

    return False

744 

745 

# Semantic aliases for the five-tuple returned by recurse_glosses1; all
# three are plain Linkage objects at runtime, the names only document
# which linkage list a value belongs to.
Related: TypeAlias = Linkage
Synonym: TypeAlias = Linkage
Antonym: TypeAlias = Linkage

749 

750 

751def recurse_glosses1( 

752 wxr: WiktextractContext, 

753 parent_sense: Sense, 

754 node: WikiNode, 

755) -> tuple[ 

756 list[Sense], 

757 list[Example], 

758 list[Related], 

759 list[Synonym], 

760 list[Antonym], 

761]: 

762 """Helper function for recurse_glosses""" 

763 # print(f"{node=}") 

764 

765 ret_senses: list[Sense] = [] 

766 ret_examples: list[Example] = [] 

767 ret_related: list[Related] = [] 

768 ret_synonyms: list[Synonym] = [] 

769 ret_antonyms: list[Antonym] = [] 

770 found_gloss = False 

771 

772 # Pydantic stuff doesn't play nice with Tatu's manual dict manipulation 

773 # functions, so we'll use a dummy dict here that we then check for 

774 # content and apply to `parent_sense`. 

775 dummy_parent: dict = {} 

776 

777 related_linkages: list[Linkage] = [] 

778 example_is_synonym = False 

779 example_is_antonym = False 

780 

781 def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None: 

782 nonlocal related_linkages 

783 nonlocal example_is_synonym 

784 nonlocal example_is_antonym 

785 # Sometimes the bl-templates point to synonyms or antonyms, instead 

786 # of just "related"; we save them, and if example_is_xxxnym is true, 

787 # we later return them as xxxnyms. 

788 if name == "βλ": 

789 for k, v in ht.items(): 

790 if isinstance(k, int): 

791 related_linkages.append( 

792 Linkage(word=clean_node(wxr, None, v)) 

793 ) 

794 return "" 

795 if name == "συνων": 

796 example_is_synonym = True 

797 return "" 

798 if name == "αντων": 

799 example_is_antonym = True 

800 return "" 

801 return None 

802 

803 # List nodes contain only LIST_ITEM nodes, which may contain sub-LIST nodes. 

804 if node.kind == NodeKind.LIST: 

805 list_ret: tuple[ 

806 list[Sense], 

807 list[Example], 

808 list[Related], 

809 list[Synonym], 

810 list[Antonym], 

811 ] = ([], [], [], [], []) 

812 for child in node.children: 

813 if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM: 813 ↛ 815line 813 didn't jump to line 815 because the condition on line 813 was never true

814 # This should never happen 

815 wxr.wtp.error( 

816 f"{child=} is direct child of NodeKind.LIST", 

817 sortid="simple/pos/44", 

818 ) 

819 continue 

820 ( 

821 senses, 

822 examples, 

823 related, 

824 synonyms, 

825 antonyms, 

826 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child) 

827 list_ret[0].extend(senses) 

828 list_ret[1].extend(examples) 

829 list_ret[2].extend(related) 

830 list_ret[3].extend(synonyms) 

831 list_ret[4].extend(antonyms) 

832 return list_ret 

833 

834 elif node.kind == NodeKind.LIST_ITEM: 834 ↛ 941line 834 didn't jump to line 941 because the condition on line 834 was always true

835 # Split at first LIST node found 

836 split_at = next( 

837 ( 

838 i 

839 for i, c in enumerate(node.children) 

840 if isinstance(c, WikiNode) and c.kind == NodeKind.LIST 

841 ), 

842 len(node.children), 

843 ) 

844 contents = node.children[:split_at] 

845 sublists = node.children[split_at:] 

846 

847 # A LIST and LIST_ITEM `sarg` is basically the prefix of the line, like 

848 # `#` or `##:`: the token that appears at the very start of a line that 

849 # is used to parse the depth and structure of lists. 

850 # `#` Item 1 

851 # `##` Item 1.1 

852 # `##*` Example 1.1 

853 if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"): 

854 # This is either a quotation or example. 

855 text = clean_node( 

856 wxr, dummy_parent, contents, template_fn=bl_template_handler_fn 

857 ).strip("⮡ \n") 

858 

859 # print(f"{contents=}, {text=}, {related_linkages=}") 

860 

861 

862 if example_is_synonym or example_is_antonym: 

863 link_linkages = [] 

864 for snode in contents: 

865 if not isinstance(snode, WikiNode): 

866 continue 

867 if snode.kind == NodeKind.LINK: 

868 link_linkages.append( 

869 Linkage( 

870 word=clean_node(wxr, None, snode.largs[0][0]) 

871 ) 

872 ) 

873 else: 

874 for link in snode.find_child_recursively(NodeKind.LINK): 874 ↛ 875line 874 didn't jump to line 875 because the loop on line 874 never started

875 link_linkages.append( 

876 Linkage( 

877 word=clean_node( 

878 wxr, None, snode.largs[0][0] 

879 ) 

880 ) 

881 ) 

882 

883 # print("=====") 

884 # print(f"{link_linkages=}") 

885 

886 if example_is_synonym: 

887 return [], [], [], link_linkages + related_linkages, [] 

888 elif example_is_antonym: 888 ↛ 891line 888 didn't jump to line 891 because the condition on line 888 was always true

889 return [], [], [], [], link_linkages + related_linkages 

890 

891 if len(related_linkages) > 0: 

892 # parent_sense.related.extend(bl_linkages) 

893 # related_linkages = [] 

894 # if not text.strip(): 

895 return [], [], related_linkages, [], [] 

896 

897 example_is_synonym = False 

898 example_is_antonym = False 

899 

900 if not text.strip(): 900 ↛ 901line 900 didn't jump to line 901 because the condition on line 900 was never true

901 return [], [], [], [], [] 

902 

903 example = Example(text=text) 

904 # logger.debug(f"{wxr.wtp.title}/example\n{text}") 

905 if len(sublists) > 0: 

906 translation = clean_node(wxr, dummy_parent, sublists).strip( 

907 "#*: \n" 

908 ) 

909 if translation != "": 909 ↛ 912line 909 didn't jump to line 912 because the condition on line 909 was always true

910 example.translation = translation 

911 

912 for k, v in dummy_parent.items(): 912 ↛ 913line 912 didn't jump to line 913 because the loop on line 912 never started

913 if k == "categories": 

914 parent_sense.categories.extend(v) 

915 dummy_parent = {} 

916 

917 return [], [example], [], [], [] 

918 

919 found_gloss = parse_gloss(wxr, parent_sense, contents) 

920 

921 for sl in sublists: 

922 if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST): 922 ↛ 924line 922 didn't jump to line 924 because the condition on line 922 was never true

923 # Should not happen 

924 wxr.wtp.error( 

925 f"Sublist is not NodeKind.LIST: {sublists=!r}", 

926 sortid="simple/pos/82", 

927 ) 

928 continue 

929 ( 

930 senses, 

931 examples, 

932 related, 

933 synonyms, 

934 antonyms, 

935 ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), sl) 

936 ret_senses.extend(senses) 

937 ret_examples.extend(examples) 

938 ret_related.extend(related) 

939 ret_synonyms.extend(synonyms) 

940 ret_antonyms.extend(antonyms) 

941 if len(ret_senses) > 0: 

942 # the recursion returned actual senses from below, so we will 

943 # ignore everything else (incl. any example data that might have 

944 # been given to parent_sense) and return that instead. 

945 # XXX if this becomes relevant, add the example data to a returned 

946 # subsense instead? 

947 # if any( 

948 # isinstance(r, Sense) and r.raw_tags == ["no-gloss"] for r in ret 

949 # ): 

950 # print(f"{ret=}") 

951 return ( 

952 combine_senses_with_identical_glosses(ret_senses), 

953 [], 

954 [], 

955 [], 

956 [], 

957 ) 

958 

959 # If nothing came from below, then this. 

960 if found_gloss is True or "no-gloss" in parent_sense.raw_tags: 960 ↛ 968line 960 didn't jump to line 968 because the condition on line 960 was always true

961 parent_sense.examples.extend(ret_examples) 

962 parent_sense.related.extend(ret_related) 

963 parent_sense.synonyms.extend(ret_synonyms) 

964 parent_sense.antonyms.extend(ret_antonyms) 

965 

966 return [parent_sense], [], [], [], [] 

967 

968 return [], ret_examples, ret_related, ret_synonyms, ret_antonyms 

969 

970 

def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Recurse through WikiNodes to find glosses and sense-related data.

    Entry point over recurse_glosses1(); returns the collected Sense
    objects with their tags normalized via convert_tags_in_sense().
    """
    senses, examples, related, synonyms, antonyms = recurse_glosses1(
        wxr, Sense(), node
    )

    # Examples and linkage data should have been attached to some sense
    # deeper in the recursion; anything arriving here leaked through.
    if examples or related or synonyms or antonyms:
        wxr.wtp.error(
            "NOT Sense has bubbled to recurse_glosses: "
            f"{examples=}, {related=}, {synonyms=}, {antonyms=}",
            sortid="pos/glosses/966",
        )

    collected: list[Sense] = []
    for sense in senses:
        convert_tags_in_sense(sense)
        collected.append(sense)

    return collected

997 

998 

def split_nodes_to_lines(
    nodes: list[WikiNode | str],
) -> Iterator[list[WikiNode | str]]:
    """Take a list of nodes and split up the list into lines.
    This could be done by using node_to_wikitext() to reverse the parsing,
    and then you could parse the individual lines after splitting the text,
    but it seems unnecessary in the context of Greek Wiktionary PoS sections.
    """
    # Accumulator for the nodes/strings belonging to the current line.
    parts: list[WikiNode | str] = []
    for node in nodes:
        if isinstance(node, WikiNode):
            # Lists are returned as whole, they're their own line
            if node.kind == NodeKind.LIST:
                if len(parts) > 0:
                    # Flush whatever preceded the list as its own line first.
                    yield parts
                    parts = []
                yield [node]
                continue
            if isinstance(node, TemplateNode) and node.template_name in (
                # Ignore specific templates, like {{((}} that bookends a column.
                "((",
                "))",
                "clear",
                "κλείδα-ελλ",
            ):
                continue
            parts.append(node)
        else:
            # Plain string: every newline inside it ends the current line.
            if "\n" in node:
                split_string = node.splitlines()
                # Each piece except the last was followed by a newline in the
                # original text, so it flushes the current line — even when
                # the piece itself is empty (a blank line still terminates).
                for spl in split_string[:-1]:
                    if spl:
                        parts.append(spl)
                    yield parts
                    parts = []
                # special handling for final newline; splitlines ignores it
                if node.endswith("\n"):
                    if split_string[-1]:
                        parts.append(split_string[-1])
                    yield parts
                    parts = []
                elif split_string[-1]:
                    # No trailing newline: the last piece opens a new line
                    # that stays unflushed for following nodes to join.
                    parts.append(split_string[-1])
            elif node:
                parts.append(node)

    # yield final parts
    if len(parts) > 0:
        yield parts

1048 

1049 

# Splits head-line text on the __B__/__I__ pseudo-markup markers plus the
# ", " and ". " separators.  The capturing group makes re.split() return
# the separators themselves at the odd indices of the result.
BOLD_RE = re.compile(r"(__/?[BI]__|, |\. )")


def extract_forms_and_tags(tagged_text: str) -> tuple[list[str], list[str]]:
    """Split marker-tagged text into (forms, raw tags).

    Text spans between __B__ and __/B__ markers are collected as word
    forms; every other non-empty span becomes a raw tag.  The ", " and
    ". " separators only split spans and are themselves discarded.
    """
    forms: list[str] = []
    raw_tags: list[str] = []
    in_bold = False

    for idx, chunk in enumerate(BOLD_RE.split(tagged_text)):
        chunk = chunk.strip()
        if not chunk:
            continue
        if idx % 2 == 1:
            # Separator position: only bold markers flip state here;
            # italics markers and ","/"." separators are ignored/dropped.
            if chunk == "__B__":
                in_bold = True
            elif chunk == "__/B__":
                in_bold = False
            continue
        # Ordinary text between separators.
        if in_bold:
            forms.append(chunk)
        else:
            raw_tags.append(chunk)

    return forms, raw_tags

1091 

1092 

# Strips the __B__/__I__/__L__/__E__ pseudo-markup markers before the
# text is inspected character by character.
META_RE = re.compile(r"__/?[ILEB]__")


def block_has_non_greek_text(text: str) -> bool:
    """Return True if any whitespace-separated word in `text` starts
    (at its first alphabetic character) with a non-Greek letter."""
    cleaned = META_RE.sub("", text)
    for word in cleaned.split():
        first_alpha = next((ch for ch in word if ch.isalpha()), None)
        if first_alpha is None:
            # Token has no letters at all (digits, punctuation): skip it.
            continue
        if not unicode_name(first_alpha).startswith("GREEK"):
            return True
    return False

1106 

1107 

def combine_senses_with_identical_glosses(
    orig_senses: list[Sense],
) -> list[Sense]:
    """Fold senses whose gloss lists are exactly identical into one.

    Duplicates are merged (via Sense.merge) into the first sense carrying
    that gloss tuple; first-seen order is preserved.  When no duplicates
    exist, the input list is returned untouched.
    """
    grouped: dict[tuple[str, ...], list[Sense]] = {}
    has_duplicates = False

    for sense in orig_senses:
        bucket = grouped.setdefault(tuple(sense.glosses), [])
        if bucket:
            # Second (or later) sense with this exact gloss tuple.
            has_duplicates = True
        bucket.append(sense)

    # Common case: every gloss tuple was unique.
    if not has_duplicates:
        return orig_senses

    merged: list[Sense] = []
    for group in grouped.values():
        primary = group[0]
        for duplicate in group[1:]:
            primary.merge(duplicate)
        merged.append(primary)

    return merged