Coverage for src/wiktextract/extractor/el/pos.py: 80%

465 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1import re 

2from collections.abc import Iterator 

3from functools import partial 

4from typing import Any, TypeAlias 

5from unicodedata import name as unicode_name 

6 

7from wikitextprocessor import ( 

8 HTMLNode, 

9 NodeKind, 

10 TemplateArgs, 

11 TemplateNode, 

12 WikiNode, 

13) 

14from wikitextprocessor.parser import LEVEL_KIND_FLAGS 

15 

16from wiktextract import WiktextractContext 

17from wiktextract.extractor.el.tags import translate_raw_tags 

18from wiktextract.page import clean_node 

19 

20from .head import parse_head 

21from .linkages import process_linkage_section 

22from .models import ( 

23 Example, 

24 FormOf, 

25 FormSource, 

26 Linkage, 

27 Sense, 

28 TemplateData, 

29 WordEntry, 

30) 

31from .parse_utils import ( 

32 GREEK_LANGCODES, 

33 expand_suffix_forms, 

34 parse_lower_heading, 

35 remove_duplicate_forms, 

36) 

37from .section_titles import POS_HEADINGS, Heading, POSName 

38from .table import parse_table, process_inflection_section, remove_article_forms 

39from .tags_utils import convert_tags_in_sense 

40from .text_utils import ( 

41 ENDING_NUMBER_RE, 

42 normalized_int, 

43) 

44from .translations import process_translations 

45 

46# from wiktextract.wxr_logging import logger 

47 

48 

def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    prev_data: WordEntry | None,  # data from the last entry in this language
    # the "noun" in "Noun 2"
    pos: POSName,
    title: str,
    pos_tags: list[str],
    # the "2" in "Noun 2"
    pos_num: int = -1,
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'.

    `data` provides basic data common with other POS sections, like
    pronunciation or etymology; it is filled in place and returned.

    Args:
        wxr: Extraction context.
        node: The POS section node whose children are parsed.
        data: The entry being built for this POS section.
        prev_data: Previous entry in the same language; its `forms` are
            copied when this section has no head line of its own.
        pos: Internal/translated POS name (e.g. "noun").
        title: Original heading text, used for `start_subsection`.
        pos_tags: Tags associated with this POS heading.
        pos_num: Disambiguator for repeated headings ("Noun 2" style);
            -1 when absent.

    Returns:
        The filled-in `data` object. (The `| None` in the return type is
        kept for interface compatibility; all visible paths return `data`.)
    """

    # Metadata for different part-of-speech kinds.
    # print(f"{pos_title=}, {pos_tags=}, {pos_num=}")
    data.pos = pos  # the internal/translated name for the POS
    data.pos_num = pos_num  # SEW uses "Noun 1", "Noun 2" style headings.
    for pos_tag in pos_tags:
        if pos_tag not in data.tags:
            data.tags.append(pos_tag)

    wxr.wtp.start_subsection(title)

    # Sound data associated with this POS might be coming from a shared
    # section, in which case we've tried to tag the sound data with its
    # pos name + number if possible. Filter out stuff that doesn't fit.
    # This is actually pretty common, but if the edition has proper hierarchies
    # for this, doing this step might be unnecessary.
    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos section(s), so
            # we add it to everything; this is basically the default behavior.
            new_sounds.append(sound)
        else:
            for sound_pos in sound.poses:
                # A pos tag may end in a number ("noun 2"); split it off so
                # both the name and the number can be matched separately.
                m = ENDING_NUMBER_RE.search(sound_pos)
                if m is not None:
                    s_num = normalized_int(m.group(1).strip())
                    s_pos = sound_pos[: m.start()].strip().lower()
                else:
                    s_pos = sound_pos.strip().lower()
                    s_num = -1
                # Normalize the heading text to the internal POS name.
                sound_meta = POS_HEADINGS[s_pos]
                s_pos = sound_meta["pos"]
                if s_pos == data.pos and s_num == data.pos_num:
                    new_sounds.append(sound)
    data.sounds = new_sounds

    # Get child nodes *except* headings (= LEVEL).
    pos_contents = list(
        node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
        # include empty string only for debug printing?
    )

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # split_nodes_to_lines returns lists items on their own 'line'
    node_lines = list(split_nodes_to_lines(pos_contents))

    glosses_index: int | None = None
    glosses_lists: list[WikiNode] = []
    for i, line in enumerate(node_lines):
        # Looking at the "rump" after glosses lists starts, it's simplest
        # just to pull all the list nodes, and handle them. Anything after
        # or inbetween (like categories, extra templates, tables and images)
        # can be ignored.
        if (
            len(line) == 1
            and isinstance(line[0], WikiNode)
            and line[0].kind == NodeKind.LIST
            and (line[0].sarg != ":")
        ):
            if glosses_index is None:
                glosses_index = i
            glosses_lists.append(line[0])

    if glosses_index is None:
        # if nothing found, accept ":" nodes
        for i, line in enumerate(node_lines):
            if (
                len(line) == 1
                and isinstance(line[0], WikiNode)
                and line[0].kind == NodeKind.LIST
            ):
                if glosses_index is None:
                    glosses_index = i
                glosses_lists.append(line[0])

    if glosses_index is None:
        # Could not find any glosses.
        # logger.info(f"  //// {wxr.wtp.title}\n  MISSING GLOSSES")
        wxr.wtp.wiki_notice("Missing glosses", sortid="pos/20250121")
        data.tags.append("no-gloss")

    template_data: list[TemplateData] = []
    # NOTE(review): category_data is populated by the node handler below but
    # never read afterwards in this function — presumably for future use.
    category_data: list[str] = []
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in bold_node_handler
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not templates inside templates.
    template_depth = 0
    top_template_name: str | None = None

    def bold_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | str | None:
        """Insert special markers `__*S__` and `__*E__` around bold nodes so
        that the strings can later be split into "head-word" and "tag-words"
        parts. Collect incidental stuff, like side-tables, that are often
        put around the head."""
        assert isinstance(node, WikiNode)
        kind = node.kind
        nonlocal template_depth
        nonlocal top_template_name
        if kind == NodeKind.BOLD or (
            isinstance(node, HTMLNode)
            and (
                node.tag == "span"
                and "style" in node.attrs
                and (
                    "bold" in node.attrs["style"]
                    # Special handling for output for stuff in arabic script
                    or node.attrs["style"] == "color:black; font-size:200%;"
                )
                or node.tag == "b"
                or node.tag == "strong"
            )
        ):
            # These are word forms almost always
            return ["__B__", *node.children, "__/B__"]
        elif kind == NodeKind.ITALIC or (
            isinstance(node, HTMLNode)
            and (
                (
                    node.tag == "span"
                    and "style" in node.attrs
                    and "italic" in node.attrs["style"]
                )
                or node.tag == "i"
                or node.tag == "em"
            )
        ):
            # These are almost always tag words; often 'kai' isn't italicized,
            # for example.
            return ["__I__", *node.children, "__/I__"]
        elif isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # the templates are handled with bold_node_handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                template_data.append(
                    TemplateData(
                        name=node.template_name,
                        args={
                            str(k): clean_node(wxr, None, v)
                            for k, v in node.template_parameters.items()
                        },
                        expansion=expanded,
                    )
                )
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            # Bracket the recursive text conversion so nested templates see
            # a non-zero depth and aren't recorded as top-level.
            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=bold_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret
        elif kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                return None
            if node.largs[0][0].startswith("Κατηγορία:"):
                # Bare category link: record the category name, emit nothing.
                category_data.append(node.largs[0][0][len("Κατηγορία:") :])
                return [""]
            # Special case for meta-links like Πρότυπο:ετ that generate
            # both a category link and :category link that is actually
            # displayed as a link, but for our purposes we want to ignore
            # that it is a link; it's a tag.
            if node.largs[0][0].startswith(":Κατηγορία:"):
                # unpacking a list-comprehension, unpacking into a list
                # seems to be more performant than adding lists together.
                return [
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                        node_handler_fn=bold_node_handler_fn,
                    )
                    # output the "visible" half of the link.
                ]
            if node.largs[0][0].startswith("Αρχείο:"):
                # Ignore file/image links entirely.
                return [""]
            # Often forms are 'formatted' with links, so let's mark these
            # too.
            return [
                "__L__",
                wxr.wtp.node_to_text(
                    node.largs[1:2] or node.largs[0],
                    node_handler_fn=bold_node_handler_fn,
                ),
                # output the "visible" half of the link.
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]
            # print(f"{node.largs=}")

        elif kind in {
            NodeKind.TABLE,
        }:
            # XXX Handle tables here
            # Remember which top-level template (if any) produced this
            # table; parsed later with parse_table().
            nonlocal table_nodes
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # Get Head Line
    # Head *should* be immediately before the glosses...
    # print(node_lines[:glosses_index])
    found_head = False

    # Walk backwards from the glosses towards the section start; the first
    # line that parse_head() accepts is taken as the head line.
    # (If glosses_index is None, the slice covers all node_lines.)
    for line in reversed(node_lines[:glosses_index]):
        # NOTE(review): template_data is reset for every line, so after the
        # loop it holds templates from the line processed last (the earliest
        # line), not necessarily the head line — confirm this is intended.
        template_data = []
        template_depth = 0
        stripped = (
            wxr.wtp.node_to_text(line, node_handler_fn=bold_node_handler_fn)
            .removeprefix(":")
            .strip()
        )
        if not stripped:
            continue
        if not found_head and (parsed_forms := parse_head(wxr, stripped)):
            for form in parsed_forms:
                translate_raw_tags(form)

            if (
                data.lang_code == "el"
                and not data.word.startswith("-")
                # If there are spaces around the "/", we don't parse the
                # header correctly, so just skip the expansion.
                # Ex. "πρωτοπόρος, -α / -ος, -ο"
                # Remove this check if that ever gets fixed.
                and len(parsed_forms) == 3
                # Only adjectives or participles
                and (
                    data.pos == "adj"
                    or (data.pos == "verb" and "participle" in data.tags)
                )
            ):
                parsed_forms = expand_suffix_forms(parsed_forms)

            parsed_forms = remove_article_forms(parsed_forms, data.word)
            data.forms.extend(parsed_forms)
            found_head = True

    if not found_head:
        # There are a bunch of Greek Wiktionary articles with POS sections
        # without heads, but they seem to always follow ones with heads;
        # in this case, the result is just not including any `forms` field
        # for these (or copying the previous one).

        if prev_data is None:
            wxr.wtp.wiki_notice(
                f"Part of speech missing head: {wxr.wtp.title}",
                sortid="pos/460/20250104",
            )
        else:
            # No head found, copy previous (in this language)
            data.forms = [
                form.model_copy(deep=True) for form in prev_data.forms
            ]

    if len(template_data) > 0:
        data.head_templates = template_data
        # logger.info(
        #     f"  //// {wxr.wtp.title}\n  >>>"
        #     + "\n  >>>".join(repr(td) for td in template_data)
        # )

    # Parse any side/inflection tables collected by the node handler above.
    for template_name, table_node in table_nodes:
        # XXX template_name
        parse_table(
            wxr,
            table_node,
            data,
            data.lang_code in GREEK_LANGCODES,
            template_name=template_name or "",
            source="inflection",
        )

    data.forms = remove_duplicate_forms(wxr, data.forms)

    # Ignore images and files
    # 2025-01-17 13:10:10,035 INFO: //// Ηρόδοτος
    # //// [[Αρχείο:Herodotus Massimo Inv124478.jpg|200px|thumb|[[προτομή]] του Ηροδότου]]

    # Have to ignore {{(( specifically. Creates columns.
    # 2025-01-17 13:10:11,059 INFO: //// κάνω
    # //// {{((|width=97%}}

    # logger.info(f"<<<<<<<<< {wxr.wtp.title}\n<<<" + "\n<<<".join(pparts))
    # see: free -> {{en-verb-'free'}} creates a floating inflection table
    # followed by the usual head template

    # see: τηλεομοιοτυπία
    # '''{{PAGENAME}}''' {{θ}}
    # theta is basically {{f|...}}

    # see: θηλυκός
    # '''{{PAGENAME}}, -ή''' ''και'' '''-ιά, -ό'''
    # pagename, -e and -ia, -o, no indication of what these mean

    # Ιόνια νησιά
    # >>>'''{{PAGENAME}}''' ''πληθυντικός του'' [[ιόνιος|Ιόνιο]] [[νησί]]
    # plural of 'Ionian island'

    # >>>>>>>>> free
    # >>>{{en-adj-r}} # floating table
    # >>>{{τ|en|{{PAGENAME}}}}, ''συγκριτικός'' '''freer''', ''υπερθετικός'' '''freest'''
    # pretty consistent bolding and italics

    # genus
    # {{τ|la|{{PAGENAME}}}} {{ο}} ([[γενική]]: generis) (3ης [[κλίση]]ς)

    # ουδέτερος
    # >>>'''{{PAGENAME}} -η -ο'''

    # καφέ
    # >>>'''{{PAGENAME}}''' {{ακλ|επίθ}}
    # aklitos, uninflected

    # καφέ
    # >>>[[Αρχείο:Cafe Museum - Vienna (5402363918).jpg|μικρογραφία|τραπέζι σε βιενέζικο '''καφέ''']]
    # >>>'''{{PAGENAME}}''' {{ο}} {{ακλ|ουδ}}
    # Ignore images

    # κρόκος
    # >>>{| align="right"
    # >>>
    # >>>|-
    # >>>
    # >>>|[[Αρχείο:Crocus sativus1.jpg|thumb|150px|Άνθη '''κρόκου''' (''Crocus sativus'').]]
    # >>>
    # >>>
    # >>>|[[Αρχείο:Raw egg.jpg|thumb|200px|Ο '''κρόκος''' ενός αβγού.]]
    # >>>
    # >>>
    # >>>|}
    # >>>
    # >>>'''{{PAGENAME}}''' {{α}}

    # p
    # >>>'''{{PAGENAME}}''' ''πεζό'' (''κεφαλαίο:'' '''{{l|P|la}}''')
    # lowercase, uppercase

    # Δημόκριτος
    # >>>'''{{PAGENAME}}'''
    # >>># {{όνομα||α}}
    # >>>{{clear}}
    # Clear is just formatting to move the line down where there are empty
    # margins.

    # BIG ASSUMPTION: let us assume that Greek Wiktionary doesn't use templates
    # that generate multiline text that is part of head. That is, we can see
    # each newline because they are in strings, and when something that does
    # generate virtual newlines (list) pops up, that's when the head portion
    # ends.
    # Greek Wiktionary head sections look like this:
    # > Pre-head templates that create side-tables, like inflections
    # > Possible formatting templates like {{clear}} that should be ignored
    # > Head template last before glosses list
    # > Clear again...
    # > Glosses list tree, where we can stop.
    # We can create "lines" of these by looping over the items in pos_content
    # and looking for newlines in strings, because that's where they mainly
    # should be (except side-table templates). We handle earlier lines
    # differently than the last line before the glosses list, which is the
    # head.

    # return None

    # ======================

    ### Glosses after head ###
    # parts = []
    got_senses = False
    for lst in glosses_lists:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses, but with
        # Simple English Wiktionary that seems rare.
        # logger.debug(f"{lst}")
        senses = recurse_glosses(wxr, lst, data)
        if len(senses) > 0:
            got_senses = True
            for sense in senses:
                translate_raw_tags(sense)
            data.senses.extend(senses)

    if not got_senses and len(glosses_lists) > 0:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    # If there is no list, clump everything into one gloss.
    # if not len(glosses_lists > 0):
    #     sense = Sense()
    #     found_gloss = parse_gloss(wxr, sense, pos_contents[i:])
    #     if found_gloss is True or len(sense.raw_tags) > 0:
    #         convert_tags_in_sense(sense)
    #         if len(sense.glosses) == 0 and "no-gloss" not in sense.tags:
    #             sense.tags.append("no-gloss")
    #         data.senses.append(sense)

    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    # Process sub-sections of this POS section (translations, inflection
    # tables, linkages). Despite the old "TEMP DEBUG PRINTS" marker that
    # used to sit here, this loop does real extraction work.
    pos_sublevels = list(
        node.find_child(LEVEL_KIND_FLAGS)
        # include empty string only for debug printing?
    )

    for sl in pos_sublevels:
        subtitle = clean_node(wxr, None, sl.largs).lower().strip()

        heading_type, *_ = parse_lower_heading(wxr, subtitle)

        if heading_type == Heading.Translations:
            process_translations(wxr, data, sl)
        elif heading_type == Heading.Infl:
            source: FormSource = "inflection"
            if data.lang_code in ("el", "grc"):
                # Greek/Ancient Greek inflection sections are verb tables.
                source = "conjugation"
            process_inflection_section(wxr, data, sl, source=source)
        elif heading_type in (
            Heading.Related,
            Heading.Synonyms,
            Heading.Antonyms,
            Heading.Transliterations,
        ):
            process_linkage_section(wxr, data, sl, heading_type)
        # if heading_type not in (
        #     Heading.Translations,
        #     Heading.Ignored,
        #     Heading.Infl,
        #     Heading.Related,
        #     Heading.Synonyms,
        #     Heading.Antonyms,
        #     Heading.Derived,
        #     # We're going to ignore homonyms because they're
        #     # only tangentially related, like anagrams
        #     Heading.Homonyms,
        # ):
        #     # ...
        #     expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(sl))
        #     # text = clean_node(wxr, None, sl)
        #     logger.warning(
        #         f"""
        # {wxr.wtp.title}: {heading_type}, {ok=}
        # {expanded}

        # ###########################
        # """
        #     )

    return data

540 

541 

# One or more "(...)" groups (plus surrounding whitespace) at the start of a
# gloss line; Greek Wiktionary commonly uses these as template-less tags.
PARENS_BEFORE_RE = re.compile(r"\s*(\([^()]+\)\s*)+")
# Iterates over individual parenthesized groups, capturing the inner text.
ITER_PARENS_RE = re.compile(r"\(([^()]+)\)")

544 

545 

def bold_node_fn(
    node: WikiNode,
) -> list[str | WikiNode] | None:
    """Wrap italic/bold nodes in sentinel markers.

    Italic children are bracketed with ``__I__``/``__/I__`` and bold
    children with ``__B__``/``__/B__`` so the flattened text can later be
    split into head-words and tag-words. Any other node kind returns None,
    leaving default handling to the caller.
    """
    # print(f"{node=}")
    markers = {
        NodeKind.ITALIC: ("__I__", "__/I__"),
        NodeKind.BOLD: ("__B__", "__/B__"),
    }
    pair = markers.get(node.kind)
    if pair is None:
        return None
    opener, closer = pair
    return [opener, *node.children, closer]

573 

574 

def extract_form_of_templates(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
    siblings: list[str | WikiNode],
    siblings_index: int,
) -> None:
    """Dispatch a template to the matching form_of extractor.

    Handles, in this order:
    1. κλ | generic | form_of
    2. γρ | generic | form_of
    3. πτώση/πτώσεις | nouns, adjectives etc. | form_of and tags
    4. υπο/υποκ | nouns | form_of
    5. μεγ/μεγεθ | nouns | form_of
    6. ρημ τύπος | verbs | form_of
    7. μτχ | verbs | form_of

    * References:
    1. https://el.wiktionary.org/wiki/Πρότυπο:κλ
    2. https://el.wiktionary.org/wiki/Module:άλλημορφή
    3. https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_κλιτικούς_τύπους
    4. https://el.wiktionary.org/wiki/Πρότυπο:υπο
    5. https://el.wiktionary.org/wiki/Πρότυπο:μεγ
    6. https://el.wiktionary.org/wiki/Πρότυπο:ρημ_τύπος
    7. https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_μετοχές
    """
    t_name = t_node.template_name
    params = t_node.template_parameters

    def delegate(extract_argument: int | str) -> None:
        # Shared hand-off to the basic extractor with the template argument
        # that carries the lemma.
        extract_form_of_templates_basic(
            wxr,
            parent_sense,
            siblings,
            siblings_index,
            t_name,
            t_node,
            extract_argument=extract_argument,
        )

    # Generic inflected-form template; lemma is the second argument.
    if t_name == "κλ":
        return delegate(2)

    # Notes:
    # * All occurrences in wiktionary have at least one argument
    # * Only handle cases where the second argument refers to a form:
    #   μορφ / μορφή / λόγια μορφή του, etc.
    #   and ignore those mistakenly used as synonym templates
    if t_name in ("γρ", "γραφή του", "alter") and 2 in params:
        if "μορφ" in clean_node(wxr, None, params[2]):
            return delegate(1)

    # Nouns and adjectives: πτώση/πτώσεις templates encode case, number and
    # possibly gender in the template name itself.
    if ("πτώσεις" in t_name or "πτώση" in t_name) and 1 in params:
        return extract_form_of_templates_ptosi(wxr, parent_sense, t_node)

    # Nouns: diminutives (υπο/υποκ) and augmentatives (μεγ/μεγεθ).
    # Note that the "diminutive/augmentative" tags will be added later on
    # via translation of the "υποκοριστικό/μεγεθυντικό" raw_tags.
    if t_name in ("υπο", "υποκ", "μεγ", "μεγεθ") and 1 in params:
        return delegate(1)

    # Verb-form template.
    if t_name == "ρημ τύπος":
        return delegate(2)

    # Participle templates (μτχ...).
    if t_name.startswith("μτχ"):
        return delegate(1)

653 

def extract_form_of_templates_basic(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    siblings: list[str | WikiNode],
    sibling_index: int,
    t_name: str,
    t_node: TemplateNode,
    extract_argument: int | str,
) -> None:
    """Extract a form_of lemma from a simple form-of template.

    The lemma is normally read from `extract_argument` in the template's
    parameters. When that argument is missing (e.g. bare μτχππ templates,
    cf. https://github.com/tatuylonen/wiktextract/issues/1372), the link
    nodes immediately following the template are consumed as the lemma
    instead. On success, a FormOf entry is appended to
    parent_sense.form_of; otherwise a wiki notice is emitted.
    """
    t_args = t_node.template_parameters
    if extract_argument in t_args:
        lemma = clean_node(wxr, None, t_args[extract_argument]).strip()
    else:
        # mtxpp template has no args, consume the next links for the
        # form_of field
        # cf. https://github.com/tatuylonen/wiktextract/issues/1372
        wxr.wtp.wiki_notice(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/570/20250517",
        )
        links: list[str | WikiNode] = []
        for node in siblings[sibling_index + 1 :]:
            # Whitespace-only strings and link nodes are part of the lemma;
            # anything else terminates it.
            if not (
                (isinstance(node, str) and node.strip() == "")
                or (isinstance(node, WikiNode) and node.kind == NodeKind.LINK)
            ):
                break
            links.append(node)
        lemma = clean_node(wxr, None, links).strip()

    if lemma:
        form_of = FormOf(word=lemma)
        parent_sense.form_of.append(form_of)
    else:
        # Fixed message: the original ran the colon straight into the
        # template name ("...whitespace:κλ, ...") and read "Lemma extract".
        wxr.wtp.wiki_notice(
            "Lemma extracted from form-of template was empty or whitespace: "
            f"{t_name}, {t_args=}, {lemma=}",
            sortid="pos/609/20250925",
        )

694 

# 3-letter Greek gender abbreviation (as prefixed to πτώση/πτώσεις template
# names, e.g. "ουδ του-πτώσειςΟΑΚεν") -> output gender tag.
PTOSI_GENDER_INFLECTION_MAP = {
    "θηλ": "feminine",
    "αρσ": "masculine",
    "ουδ": "neuter",
}
# Lowercase number suffix of πτώση/πτώσεις template names -> number tag.
PTOSI_NUMBER_INFLECTION_MAP = {
    "εν": "singular",
    "πλ": "plural",
}
# Uppercase case letter embedded in πτώση/πτώσεις template names -> case tag.
PTOSI_CASE_INFLECTION_MAP = {
    "Ο": "nominative",
    "Α": "accusative",
    "Γ": "genitive",
    "Κ": "vocative",
}

710 

711 

def extract_form_of_templates_ptosi(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
) -> None:
    """Parse form_of for nouns and adjectives.

    Supports:
    * [gender του] πτώση-πτώσεις templates

    Notes:
    * The πτώση-πτώσεις templates contains:
        * Case(s): 1 for πτώση, >1 for πτώσεις - in uppercase characters.
        * Number: "εν" (singular) or "πλ" (plural)
    Examples:
        * {{πτώσηΑεν|κόρφος}} > accusative | singular
        * {{πτώσειςΟΚπλ|κόρφος}} > nominative, vocative | plural

    The lemma (first template argument) is appended to
    parent_sense.form_of and the decoded tags to parent_sense.tags.
    Malformed template names are silently ignored.
    """
    t_name = t_node.template_name
    inflection_t_names = ("πτώσεις", "πτώση")
    tags: list[str] = []

    # Parse and consume gender if any
    if "-" in t_name:
        # Cf. {{ουδ του-πτώσειςΟΑΚεν|καλός|grc}}
        # maxsplit=1 prevents an uncaught ValueError if the template name
        # unexpectedly contains more than one hyphen.
        gender, inflection = t_name.split("-", 1)
        code = gender[:3]
        try:
            gender_tag = PTOSI_GENDER_INFLECTION_MAP[code]
        except KeyError:
            # Bad template name.
            return
        tags.append(gender_tag)
    else:
        inflection = t_name

    # Remove πτώση-πτώσεις prefix, leaving case letters + number suffix.
    for prefix in inflection_t_names:
        if inflection.startswith(prefix):
            inflection = inflection[len(prefix) :]
            break

    try:
        # Lowercase letters spell the number ("εν"/"πλ"); uppercase letters
        # are the individual cases.
        lowercase = "".join(ch for ch in inflection if ch.islower())
        number = PTOSI_NUMBER_INFLECTION_MAP[lowercase]
        uppercase = [ch for ch in inflection if not ch.islower()]
        cases = [PTOSI_CASE_INFLECTION_MAP[ch] for ch in uppercase]
    except KeyError:
        # Bad template name.
        return

    tags.extend([*cases, number])
    tags.sort()  # For the tests, but also good practice

    lemma = clean_node(wxr, None, t_node.template_parameters[1])
    form_of = FormOf(word=lemma)
    parent_sense.form_of.append(form_of)
    parent_sense.tags.extend(tags)

770 

771 

def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Take what is preferably a line of text and extract tags and a gloss from
    it. The data is inserted into parent_sense, and for recursion purposes
    we return a boolean that tells whether there was any gloss text in a
    lower node.

    Cleanup: the original declared a `template_tags` local that was never
    populated (its length check was provably dead); it has been removed.
    """
    if len(contents) == 0:
        return False

    # Form-of templates anywhere on the line contribute form_of data.
    for i, t_node in enumerate(contents):
        if isinstance(t_node, TemplateNode):
            extract_form_of_templates(wxr, parent_sense, t_node, contents, i)

    bl_linkages: list[Linkage] = []
    no_gloss_but_keep_anyway = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        # {{βλ}} ("see also") templates become `related` linkages and are
        # removed from the gloss text.
        nonlocal bl_linkages
        if name == "βλ":
            for k, v in ht.items():
                if isinstance(k, int):
                    bl_linkages.append(Linkage(word=clean_node(wxr, None, v)))
            return ""
        return None

    # The rest of the text.
    text = clean_node(
        wxr,
        parent_sense,
        contents,
        template_fn=bl_template_handler_fn,
        node_handler_fn=bold_node_fn,
    )

    if len(bl_linkages) > 0:
        parent_sense.related.extend(bl_linkages)
        no_gloss_but_keep_anyway = True

    if not text.strip():
        if len(bl_linkages) <= 0:
            return False
        # Otherwise fall through: the βλ linkages make this sense worth
        # keeping even without gloss text.

    # print(f" ============ {contents=}, {text=}")

    # Greek Wiktionary uses a lot of template-less tags.
    if parens_n := PARENS_BEFORE_RE.match(text):
        blocks = ITER_PARENS_RE.findall(parens_n.group(0))
        # print(f"{blocks=}")
        kept_blocks: list[str] = []
        forms: list[str] = []
        raw_tag_texts: list[str] = []
        for block in blocks:
            if block_has_non_greek_text(block):
                # Keep parentheses with non-greek text with gloss text)
                kept_blocks.extend(("(", block, ") "))
                continue
            nforms, nraw_tag_texts = extract_forms_and_tags(block)
            forms.extend(nforms)
            raw_tag_texts.extend(nraw_tag_texts)
        # print(f"{forms=}, {raw_tag_texts=}")
        if forms:
            # print(f"{forms=}")
            parent_sense.related.extend(Linkage(word=form) for form in forms)
        parent_sense.raw_tags.extend(raw_tag_texts)
        kept_blocks.append(text[parens_n.end() :])
        text = "".join(kept_blocks)

    # Strip the bold/italic sentinel markers inserted by bold_node_fn.
    text = re.sub(r"__/?[IB]__", "", text)

    if len(text) > 0:
        parent_sense.glosses.append(text)
        return True

    if no_gloss_but_keep_anyway:
        parent_sense.tags.append("no-gloss")
        return True

    return False

856 

857 

# Readability aliases for the recurse_glosses return tuple; all linkage
# kinds share the same Linkage model.
Related: TypeAlias = Linkage
Synonym: TypeAlias = Linkage
Antonym: TypeAlias = Linkage

861 

862 

def recurse_glosses1(
    wxr: WiktextractContext,
    parent_sense: Sense,
    node: WikiNode,
) -> tuple[
    list[Sense],
    list[Example],
    list[Related],
    list[Synonym],
    list[Antonym],
]:
    """Helper function for recurse_glosses.

    Recursively walks a gloss LIST/LIST_ITEM tree. Each recursion level gets
    a deep copy of `parent_sense` so that data collected on one branch does
    not leak into siblings. Exactly one of the five returned lists is
    normally populated at each level: senses bubble up from gloss lines,
    while examples/related/synonyms/antonyms bubble up from `#:`/`#*`-style
    sub-lines until a gloss-bearing ancestor absorbs them.
    """
    # print(f"{node=}")

    ret_senses: list[Sense] = []
    ret_examples: list[Example] = []
    ret_related: list[Related] = []
    ret_synonyms: list[Synonym] = []
    ret_antonyms: list[Antonym] = []
    found_gloss = False

    # Pydantic stuff doesn't play nice with Tatu's manual dict manipulation
    # functions, so we'll use a dummy dict here that we then check for
    # content and apply to `parent_sense`.
    dummy_parent: dict[str, Any] = {}

    # These are mutated by bl_template_handler_fn as a side effect of
    # clean_node() expanding templates; the order of checks below depends
    # on that mutation having already happened.
    related_linkages: list[Linkage] = []
    example_is_synonym = False
    example_is_antonym = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        """Template hook: capture {{βλ}} links and συνων/αντων markers.

        Returns "" to delete the handled template from the expanded text,
        or None to let other templates expand normally.
        """
        nonlocal related_linkages
        nonlocal example_is_synonym
        nonlocal example_is_antonym
        # Sometimes the bl-templates point to synonyms or antonyms, instead
        # of just "related"; we save them, and if example_is_xxxnym is true,
        # we later return them as xxxnyms.
        if name == "βλ":
            # Positional (int-keyed) args of {{βλ}} are the linked words.
            for k, v in ht.items():
                if isinstance(k, int):
                    related_linkages.append(
                        Linkage(word=clean_node(wxr, None, v))
                    )
            return ""
        if name in ("συνων", "συνών"):
            example_is_synonym = True
            return ""
        if name in ("αντων", "αντών"):
            example_is_antonym = True
            return ""
        return None

    # List nodes contain only LIST_ITEM nodes, which may contain sub-LIST nodes.
    if node.kind == NodeKind.LIST:
        list_ret: tuple[
            list[Sense],
            list[Example],
            list[Related],
            list[Synonym],
            list[Antonym],
        ] = ([], [], [], [], [])
        for child in node.children:
            if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM:
                # This should never happen
                wxr.wtp.error(
                    f"{child=} is direct child of NodeKind.LIST",
                    sortid="simple/pos/44",
                )
                continue
            # Deep-copy the parent sense so siblings don't share state.
            (
                senses,
                examples,
                related,
                synonyms,
                antonyms,
            ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child)
            list_ret[0].extend(senses)
            list_ret[1].extend(examples)
            list_ret[2].extend(related)
            list_ret[3].extend(synonyms)
            list_ret[4].extend(antonyms)
        return list_ret

    elif node.kind == NodeKind.LIST_ITEM:
        # Split at first LIST node found
        split_at = next(
            (
                i
                for i, c in enumerate(node.children)
                if isinstance(c, WikiNode) and c.kind == NodeKind.LIST
            ),
            len(node.children),
        )
        contents = node.children[:split_at]
        sublists = node.children[split_at:]

        # A LIST and LIST_ITEM `sarg` is basically the prefix of the line, like
        # `#` or `##:`: the token that appears at the very start of a line that
        # is used to parse the depth and structure of lists.
        # `#` Item 1
        # `##` Item 1.1
        # `##*` Example 1.1
        if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"):
            # This is either a quotation or example.
            # NOTE: clean_node() also runs bl_template_handler_fn, which may
            # set example_is_synonym/antonym and fill related_linkages.
            text = clean_node(
                wxr, dummy_parent, contents, template_fn=bl_template_handler_fn
            ).strip("⮡ \n")

            # print(f"{contents=}, {text=}, {related_linkages=}")

            if example_is_synonym or example_is_antonym:
                # The "example" line is actually a synonym/antonym line;
                # harvest its wikilinks as linkage targets.
                link_linkages = []
                for snode in contents:
                    if not isinstance(snode, WikiNode):
                        continue
                    if snode.kind == NodeKind.LINK:
                        link_linkages.append(
                            Linkage(
                                word=clean_node(wxr, None, snode.largs[0][0])
                            )
                        )
                    else:
                        for link in snode.find_child_recursively(NodeKind.LINK):
                            link_linkages.append(
                                Linkage(word=clean_node(wxr, None, link))
                            )

                # print("=====")
                # print(f"{link_linkages=}")

                if example_is_synonym:
                    return [], [], [], link_linkages + related_linkages, []
                elif example_is_antonym:
                    return [], [], [], [], link_linkages + related_linkages

            if len(related_linkages) > 0:
                # {{βλ}} produced "related" links; return them instead of
                # treating the line as an example.
                # parent_sense.related.extend(bl_linkages)
                # related_linkages = []
                # if not text.strip():
                return [], [], related_linkages, [], []

            example_is_synonym = False
            example_is_antonym = False

            if not text.strip():
                return [], [], [], [], []

            example = Example(text=text)
            # logger.debug(f"{wxr.wtp.title}/example\n{text}")
            if len(sublists) > 0:
                # A sub-list under an example line holds its translation.
                translation = clean_node(wxr, dummy_parent, sublists).strip(
                    "#*: \n"
                )
                if translation != "":
                    example.translation = translation

            # Apply any categories clean_node collected into the dummy dict.
            for k, v in dummy_parent.items():
                if k == "categories":
                    parent_sense.categories.extend(v)
            dummy_parent = {}

            return [], [example], [], [], []

        # Not an example/quotation line: parse the contents as a gloss,
        # mutating parent_sense in place.
        found_gloss = parse_gloss(wxr, parent_sense, contents)

        for sl in sublists:
            if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST):
                # Should not happen
                wxr.wtp.error(
                    f"Sublist is not NodeKind.LIST: {sublists=!r}",
                    sortid="simple/pos/82",
                )
                continue
            (
                senses,
                examples,
                related,
                synonyms,
                antonyms,
            ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), sl)
            ret_senses.extend(senses)
            ret_examples.extend(examples)
            ret_related.extend(related)
            ret_synonyms.extend(synonyms)
            ret_antonyms.extend(antonyms)
    if len(ret_senses) > 0:
        # the recursion returned actual senses from below, so we will
        # ignore everything else (incl. any example data that might have
        # been given to parent_sense) and return that instead.
        # XXX if this becomes relevant, add the example data to a returned
        # subsense instead?
        # if any(
        #     isinstance(r, Sense) and r.tags == ["no-gloss"] for r in ret
        # ):
        # print(f"{ret=}")
        return (
            combine_senses_with_identical_glosses(ret_senses),
            [],
            [],
            [],
            [],
        )

    # If nothing came from below, then this.
    if found_gloss is True or "no-gloss" in parent_sense.tags:
        # Absorb child examples/linkages into this gloss-bearing sense.
        parent_sense.examples.extend(ret_examples)
        parent_sense.related.extend(ret_related)
        parent_sense.synonyms.extend(ret_synonyms)
        parent_sense.antonyms.extend(ret_antonyms)

        return [parent_sense], [], [], [], []

    return [], ret_examples, ret_related, ret_synonyms, ret_antonyms

1076 

1077 

def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Recurse through WikiNodes to find glosses and sense-related data."""
    # NOTE(review): `data` is currently unused here; kept for interface
    # stability with callers.
    senses, examples, related, synonyms, antonyms = recurse_glosses1(
        wxr, Sense(), node
    )

    # Non-sense data reaching the top level means example/linkage lines had
    # no gloss-bearing ancestor to absorb them; report and drop them.
    if examples or related or synonyms or antonyms:
        wxr.wtp.error(
            "NOT Sense has bubbled to recurse_glosses: "
            f"{examples=}, {related=}, {synonyms=}, {antonyms=}",
            sortid="pos/glosses/966",
        )

    for sense in senses:
        convert_tags_in_sense(sense)

    return list(senses)

1104 

1105 

def split_nodes_to_lines(
    nodes: list[WikiNode | str],
) -> Iterator[list[WikiNode | str]]:
    """Take a list of nodes and split up the list into lines.
    This could be done by using node_to_wikitext() to reverse the parsing,
    and then you could parse the individual lines after splitting the text,
    but it seems unnecessary in the context of Greek Wiktionary PoS sections.
    """
    line: list[WikiNode | str] = []
    for node in nodes:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LIST:
                # A list always forms its own line; flush anything pending.
                if len(line) > 0:
                    yield line
                    line = []
                yield [node]
            elif isinstance(node, TemplateNode) and node.template_name in (
                # Ignore specific templates, like {{((}} that bookends a
                # column.
                "((",
                "))",
                "clear",
                "κλείδα-ελλ",
            ):
                pass
            else:
                line.append(node)
            continue
        # Plain string: embedded newlines terminate lines.
        if "\n" not in node:
            if node:
                line.append(node)
            continue
        pieces = node.splitlines()
        for piece in pieces[:-1]:
            if piece:
                line.append(piece)
            yield line
            line = []
        # splitlines() drops a trailing newline, so the last segment needs
        # special handling: a trailing "\n" means it completes a line.
        tail = pieces[-1]
        if node.endswith("\n"):
            if tail:
                line.append(tail)
                yield line
                line = []
        elif tail:
            line.append(tail)

    # yield whatever is left over
    if len(line) > 0:
        yield line

1156 

BOLD_RE = re.compile(r"(__/?[BI]__|, |\. )")


def extract_forms_and_tags(tagged_text: str) -> tuple[list[str], list[str]]:
    """Split pseudo-markup text into bolded forms and raw tags.

    `tagged_text` uses __B__/__I__ marker pairs (left over from earlier
    processing) to flag bold/italic spans. Text inside __B__..__/B__ is
    collected as a word form; all other text segments become raw tags.
    The ", " and ". " separators merely split the text and are discarded.
    """
    forms: list[str] = []
    tags: list[str] = []

    bolded = False
    # BOLD_RE has one capture group, so re.split() interleaves text
    # segments (even indices) with the separators themselves (odd indices).
    for idx, piece in enumerate(BOLD_RE.split(tagged_text)):
        piece = piece.strip()
        if not piece:
            continue
        if idx % 2 == 1:
            # Separator: track bold state; italic markers and the
            # stripped ","/"." separators are ignored.
            if piece == "__B__":
                bolded = True
            elif piece == "__/B__":
                bolded = False
            continue
        # Text segment between separators.
        if bolded:
            forms.append(piece)
        else:
            # Everything outside bold spans is treated as a raw tag.
            tags.append(piece)

    return forms, tags

1198 

1199 

META_RE = re.compile(r"__/?[ILEB]__")


def block_has_non_greek_text(text: str) -> bool:
    """Return True if any word in `text` starts with a non-Greek letter.

    Pseudo-markup markers (__B__, __I__, __L__, __E__) are stripped first.
    Only the first alphabetic character of each whitespace-separated token
    is inspected; its Unicode name decides the script.
    """
    cleaned = META_RE.sub("", text)
    for token in cleaned.split():
        # Find the first letter of the token, skipping digits/punctuation.
        first_letter = next((ch for ch in token if ch.isalpha()), None)
        if first_letter is not None and not unicode_name(
            first_letter
        ).startswith("GREEK"):
            return True
    return False

1213 

1214 

def combine_senses_with_identical_glosses(
    orig_senses: list[Sense],
) -> list[Sense]:
    """Merge senses whose gloss lists are exactly identical.

    Senses are grouped by their gloss tuple; when duplicates exist, later
    senses in a group are merged into the first one (via Sense.merge).
    Output preserves first-occurrence order. If no duplicates are found the
    original list is returned unchanged.
    """
    grouped: dict[tuple[str, ...], list[Sense]] = {}
    for sense in orig_senses:
        grouped.setdefault(tuple(sense.glosses), []).append(sense)

    # Fast path: every sense had a unique gloss tuple.
    if len(grouped) == len(orig_senses):
        return orig_senses

    combined: list[Sense] = []
    for group in grouped.values():
        primary, *duplicates = group
        for dup in duplicates:
            primary.merge(dup)
        combined.append(primary)

    return combined