Coverage for src/wiktextract/extractor/el/pos.py: 81% (468 statements)
coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

import re

from collections.abc import Iterator
from functools import partial
from typing import Any, TypeAlias
from unicodedata import name as unicode_name

from wikitextprocessor import (
    HTMLNode,
    NodeKind,
    TemplateArgs,
    TemplateNode,
    WikiNode,
)
from wikitextprocessor.parser import LEVEL_KIND_FLAGS

from wiktextract import WiktextractContext
from wiktextract.extractor.el.tags import translate_raw_tags
from wiktextract.page import clean_node

from .head import parse_head
from .linkages import process_linkage_section
from .models import (
    AltForm,
    Example,
    FormSource,
    Linkage,
    Sense,
    TemplateData,
    WordEntry,
)
from .parse_utils import (
    GREEK_LANGCODES,
    expand_suffix_forms,
    parse_lower_heading,
    remove_duplicate_forms,
)
from .section_titles import POS_HEADINGS, Heading, POSName
from .table import parse_table, process_inflection_section, remove_article_forms
from .tags_utils import convert_tags_in_sense
from .text_utils import (
    ENDING_NUMBER_RE,
    normalized_int,
)
from .translations import process_translations

# from wiktextract.wxr_logging import logger


def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    prev_data: WordEntry | None,  # data from the last entry in this language
    # the "noun" in "Noun 2"
    pos: POSName,
    title: str,
    pos_tags: list[str],
    # the "2" in "Noun 2"
    pos_num: int = -1,
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'. `data` provides basic
    data shared with other POS sections, like pronunciation or etymology."""

    # Metadata for different part-of-speech kinds.
    # print(f"{pos_title=}, {pos_tags=}, {pos_num=}")
    data.pos = pos  # the internal/translated name for the POS
    data.pos_num = pos_num  # This edition uses "Noun 1", "Noun 2" headings.
    for pos_tag in pos_tags:
        if pos_tag not in data.tags:
            data.tags.append(pos_tag)

    wxr.wtp.start_subsection(title)

    # Sound data associated with this POS might come from a shared section,
    # in which case we've tried to tag the sound data with its pos name +
    # number if possible. Filter out anything that doesn't fit. This is
    # actually pretty common, but if the edition has proper hierarchies for
    # this, this step might be unnecessary.
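    # For example (illustrative): a sound entry tagged with
    # sound.poses == ["noun 2"] is kept only where data.pos == "noun"
    # and data.pos_num == 2.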

    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos
            # section(s), so we add it to everything; this is basically
            # the default behavior.
            new_sounds.append(sound)
        else:
            for sound_pos in sound.poses:
                m = ENDING_NUMBER_RE.search(sound_pos)
                if m is not None:
                    s_num = normalized_int(m.group(1).strip())
                    s_pos = sound_pos[: m.start()].strip().lower()
                else:
                    s_pos = sound_pos.strip().lower()
                    s_num = -1
                sound_meta = POS_HEADINGS[s_pos]
                s_pos = sound_meta["pos"]
                if s_pos == data.pos and s_num == data.pos_num:
                    new_sounds.append(sound)
    data.sounds = new_sounds

    # Get child nodes *except* headings (= LEVEL).
    pos_contents = list(
        node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
        # include empty string only for debug printing?
    )

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # split_nodes_to_lines returns LIST nodes on their own 'line'
    node_lines = list(split_nodes_to_lines(pos_contents))

    glosses_index = None
    glosses_lists = []
    for i, line in enumerate(node_lines):
        # Looking at the "rump" after the glosses lists start, it's simplest
        # just to pull all the list nodes and handle them. Anything after
        # or in between (like categories, extra templates, tables and
        # images) can be ignored.
        if (
            len(line) == 1
            and isinstance(line[0], WikiNode)
            and line[0].kind == NodeKind.LIST
            and (line[0].sarg != ":")
        ):
            if glosses_index is None:
                glosses_index = i
            glosses_lists.append(line[0])

    if glosses_index is None:
        # If nothing was found, accept ":" nodes too.
        for i, line in enumerate(node_lines):
            if (
                len(line) == 1
                and isinstance(line[0], WikiNode)
                and line[0].kind == NodeKind.LIST
            ):
                if glosses_index is None:
                    glosses_index = i
                glosses_lists.append(line[0])

    if glosses_index is None:
        # Could not find any glosses.
        # logger.info(f"  //// {wxr.wtp.title}\n   MISSING GLOSSES")
        wxr.wtp.wiki_notice("Missing glosses", sortid="pos/20250121")
        data.tags.append("no-gloss")

    template_data: list[TemplateData] = []
    category_data: list[str] = []
    table_nodes: list[tuple[str | None, WikiNode]] = []
    # template_depth is used as a nonlocal variable in bold_node_handler
    # to gauge how deep inside a top-level template we are; we want to
    # collect template data only for the top-level templates that are
    # visible in the wikitext, not templates inside templates.
    template_depth = 0
    top_template_name: str | None = None

    def bold_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | str | None:
        """Insert special markers (`__B__`/`__/B__`, `__I__`/`__/I__`,
        `__L__`/`__/L__`) around bold, italic and link nodes so that the
        strings can later be split into "head-word" and "tag-words" parts.
        Collect incidental stuff, like side-tables, that is often put
        around the head."""
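        # Illustrative example: the wikitext "'''λέξη''' ''θηλυκό''" comes
        # out of node_to_text as "__B__λέξη__/B__ __I__θηλυκό__/I__", so the
        # bolded head word can be told apart from italicized tag words.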

        assert isinstance(node, WikiNode)
        kind = node.kind
        nonlocal template_depth
        nonlocal top_template_name
        if kind == NodeKind.BOLD or (
            isinstance(node, HTMLNode)
            and (
                node.tag == "span"
                and "style" in node.attrs
                and (
                    "bold" in node.attrs["style"]
                    # Special handling for output of stuff in Arabic script
                    or node.attrs["style"] == "color:black; font-size:200%;"
                )
                or node.tag == "b"
                or node.tag == "strong"
            )
        ):
            # These are almost always word forms
            return ["__B__", *node.children, "__/B__"]
        elif kind == NodeKind.ITALIC or (
            isinstance(node, HTMLNode)
            and (
                (
                    node.tag == "span"
                    and "style" in node.attrs
                    and "italic" in node.attrs["style"]
                )
                or node.tag == "i"
                or node.tag == "em"
            )
        ):
            # These are almost always tag words; often 'kai' isn't
            # italicized, for example.
            return ["__I__", *node.children, "__/I__"]
        elif isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # templates are handled with bold_node_handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            if template_depth == 0:
                # We are looking at a top-level template in the original
                # wikitext.
                template_data.append(
                    TemplateData(
                        name=node.template_name,
                        args={
                            str(k): clean_node(wxr, None, v)
                            for k, v in node.template_parameters.items()
                        },
                        expansion=expanded,
                    )
                )
                top_template_name = node.template_name
            new_node = wxr.wtp.parse(expanded)

            template_depth += 1
            ret = wxr.wtp.node_to_text(
                new_node, node_handler_fn=bold_node_handler_fn
            )
            template_depth -= 1
            if template_depth == 0:
                top_template_name = None
            return ret
        elif kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                return None
            if node.largs[0][0].startswith("Κατηγορία:"):
                category_data.append(node.largs[0][0][len("Κατηγορία:") :])
                return [""]
            # Special case for meta-links like Πρότυπο:ετ that generate
            # both a category link and a :category link; the latter is
            # actually displayed as a link, but for our purposes we want
            # to ignore that it is a link; it's a tag.
            if node.largs[0][0].startswith(":Κατηγορία:"):
                return [
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                        node_handler_fn=bold_node_handler_fn,
                    )
                    # output the "visible" half of the link.
                ]
            if node.largs[0][0].startswith("Αρχείο:"):
                return [""]
            # Often forms are 'formatted' with links, so let's mark these
            # too.
            return [
                "__L__",
                wxr.wtp.node_to_text(
                    node.largs[1:2] or node.largs[0],
                    node_handler_fn=bold_node_handler_fn,
                ),
                # output the "visible" half of the link.
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]
            # print(f"{node.largs=}")

        elif kind in {
            NodeKind.TABLE,
        }:
            # XXX Handle tables here
            # template depth and top-level template name
            nonlocal table_nodes
            table_nodes.append((top_template_name, node))
            return [""]
        return None

    # Get the head line.
    # The head *should* be immediately before the glosses...
    # print(node_lines[:glosses_index])
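    # Illustrative example: a head line like "'''λέξη''' θηλυκό" is rendered
    # to "__B__λέξη__/B__ θηλυκό" by the handler above and then handed to
    # parse_head, which extracts the head word and its tag words.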

    found_head = False

    for line in reversed(node_lines[:glosses_index]):
        template_data = []
        template_depth = 0
        stripped = (
            wxr.wtp.node_to_text(line, node_handler_fn=bold_node_handler_fn)
            .removeprefix(":")
            .strip()
        )
        if not stripped:
            continue
        if not found_head and (parsed_forms := parse_head(wxr, stripped)):
            for form in parsed_forms:
                translate_raw_tags(form)

            if (
                data.lang_code == "el"
                # If there are spaces around the "/", we don't parse the
                # header correctly, so just skip the expansion.
                # Ex. "πρωτοπόρος, -α / -ος, -ο"
                # Remove this check if that ever gets fixed.
                and len(parsed_forms) == 3
                # Only adjectives or participles
                and (
                    data.pos == "adj"
                    or (data.pos == "verb" and "participle" in data.tags)
                )
            ):
                parsed_forms = expand_suffix_forms(parsed_forms)

            parsed_forms = remove_article_forms(parsed_forms, data.word)
            data.forms.extend(parsed_forms)
            found_head = True

    if not found_head:
        # There are a bunch of Greek Wiktionary articles with POS sections
        # without heads, but they seem to always follow ones with heads;
        # in this case, the result is just not including any `forms` field
        # for these (or copying the previous one).

        if prev_data is None:
            wxr.wtp.wiki_notice(
                f"Part of speech missing head: {wxr.wtp.title}",
                sortid="pos/460/20250104",
            )
        else:
            # No head found, copy previous (in this language)
            data.forms = [
                form.model_copy(deep=True) for form in prev_data.forms
            ]

    if len(template_data) > 0:
        data.head_templates = template_data
        # logger.info(
        #     f"  //// {wxr.wtp.title}\n  >>>"
        #     + "\n  >>>".join(repr(td) for td in template_data)
        # )

    for template_name, table_node in table_nodes:
        # XXX template_name
        parse_table(
            wxr,
            table_node,
            data,
            data.lang_code in GREEK_LANGCODES,
            template_name=template_name or "",
            source="inflection",
        )

    data.forms = remove_duplicate_forms(wxr, data.forms)

    # Ignore images and files
    # 2025-01-17 13:10:10,035 INFO: //// Ηρόδοτος
    # //// [[Αρχείο:Herodotus Massimo Inv124478.jpg|200px|thumb|[[προτομή]] του Ηροδότου]]

    # Have to ignore {{(( specifically. Creates columns.
    # 2025-01-17 13:10:11,059 INFO: //// κάνω
    # //// {{((|width=97%}}

    # logger.info(f"<<<<<<<<< {wxr.wtp.title}\n<<<" + "\n<<<".join(pparts))
    # see: free -> {{en-verb-'free'}} creates a floating inflection table
    # followed by the usual head template

    # >>>>>>>>> free
    # >>>{{en-adj-r}}  # floating table
    # >>>{{τ|en|{{PAGENAME}}}}, ''συγκριτικός'' '''freer''', ''υπερθετικός'' '''freest'''
    # pretty consistent bolding and italics

    # genus
    # {{τ|la|{{PAGENAME}}}} {{ο}} ([[γενική]]: generis) (3ης [[κλίση]]ς)

    # καφέ
    # >>>[[Αρχείο:Cafe Museum - Vienna (5402363918).jpg|μικρογραφία|τραπέζι σε βιενέζικο '''καφέ''']]
    # >>>'''{{PAGENAME}}''' {{ο}} {{ακλ|ουδ}}
    # Ignore images

    # κρόκος
    # >>>{| align="right"
    # >>>
    # >>>|-
    # >>>
    # >>>|[[Αρχείο:Crocus sativus1.jpg|thumb|150px|Άνθη '''κρόκου''' (''Crocus sativus'').]]
    # >>>
    # >>>
    # >>>|[[Αρχείο:Raw egg.jpg|thumb|200px|Ο '''κρόκος''' ενός αβγού.]]
    # >>>
    # >>>
    # >>>|}
    # >>>
    # >>>'''{{PAGENAME}}''' {{α}}

    # p
    # >>>'''{{PAGENAME}}''' ''πεζό'' (''κεφαλαίο:'' '''{{l|P|la}}''')
    # lowercase, uppercase

    # Δημόκριτος
    # >>>'''{{PAGENAME}}'''
    # >>># {{όνομα||α}}
    # >>>{{clear}}
    # Clear is just formatting to move the line down where there are empty
    # margins.

    # BIG ASSUMPTION: let us assume that Greek Wiktionary doesn't use
    # templates that generate multiline text as part of the head. That is,
    # we can see each newline because they are in strings, and when
    # something that generates virtual newlines (a list) pops up, that's
    # where the head portion ends.
    # Greek Wiktionary head sections look like this:
    # > Pre-head templates that create side-tables, like inflections
    # > Possible formatting templates like {{clear}} that should be ignored
    # > Head template last before glosses list
    # > Clear again...
    # > Glosses list tree, where we can stop.
    # We can create "lines" of these by looping over the items in
    # pos_contents and looking for newlines in strings, because that's where
    # they mainly should be (except side-table templates). We handle earlier
    # lines differently than the last line before the glosses list, which
    # is the head.


    # ======================

    ### Glosses after head ###
    got_senses = False
    for lst in glosses_lists:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses, though
        # that seems rare.
        # logger.debug(f"{lst}")
        senses = recurse_glosses(wxr, lst, data)
        if len(senses) > 0:
            got_senses = True
            for sense in senses:
                translate_raw_tags(sense)
            data.senses.extend(senses)

    if not got_senses and len(glosses_lists) > 0:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    # If there is no list, clump everything into one gloss.
    # if not len(glosses_lists > 0):
    #     sense = Sense()
    #     found_gloss = parse_gloss(wxr, sense, pos_contents[i:])
    #     if found_gloss is True or len(sense.raw_tags) > 0:
    #         convert_tags_in_sense(sense)
    #         if len(sense.glosses) == 0 and "no-gloss" not in sense.tags:
    #             sense.tags.append("no-gloss")
    #         data.senses.append(sense)

    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    #####
    #####
    # TEMP DEBUG PRINTS

    pos_sublevels = list(
        node.find_child(LEVEL_KIND_FLAGS)
        # include empty string only for debug printing?
    )

    for sl in pos_sublevels:
        subtitle = clean_node(wxr, None, sl.largs).lower().strip()

        heading_type, *_ = parse_lower_heading(wxr, subtitle)

        match heading_type:
            case Heading.Translations:
                process_translations(wxr, data, sl)
            case Heading.Infl:
                source: FormSource = "inflection"
                if data.lang_code in ("el", "grc"):
                    source = "conjugation"
                process_inflection_section(wxr, data, sl, source=source)
            case (
                Heading.Related
                | Heading.Synonyms
                | Heading.Antonyms
                | Heading.Transliterations
                | Heading.AltOf
                | Heading.FormOf
            ):
                process_linkage_section(wxr, data, sl, heading_type)
        # if heading_type not in (
        #     Heading.Translations,
        #     Heading.Ignored,
        #     Heading.Infl,
        #     Heading.Related,
        #     Heading.Synonyms,
        #     Heading.Antonyms,
        #     Heading.Derived,
        #     # We're going to ignore homonyms because they're
        #     # only tangentially related, like anagrams
        #     Heading.Homonyms,
        # ):
        #     # ...
        #     expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(sl))
        #     # text = clean_node(wxr, None, sl)
        #     logger.warning(
        #         f"""
        # {wxr.wtp.title}: {heading_type}, {ok=}
        # {expanded}

        # ###########################
        # """
        #     )

    #####
    #####
    return data


PARENS_BEFORE_RE = re.compile(r"\s*(\([^()]+\)\s*)+")
ITER_PARENS_RE = re.compile(r"\(([^()]+)\)")
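# Illustrative example: for a gloss line starting "(μεταφορικά) (σπάνιο) ...",
# PARENS_BEFORE_RE matches the leading parenthesized run, and ITER_PARENS_RE
# then yields the individual blocks "μεταφορικά" and "σπάνιο".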


def bold_node_fn(
    node: WikiNode,
) -> list[str | WikiNode] | None:
    """Handle nodes in the parse tree specially."""
    # print(f"{node=}")
    if node.kind == NodeKind.ITALIC:
        return ["__I__", *node.children, "__/I__"]
    if node.kind == NodeKind.BOLD:
        return ["__B__", *node.children, "__/B__"]
    # if node.kind == NodeKind.LINK:
    #     if not isinstance(node.largs[0][0], str):
    #         return None
    #     return [
    #         "__L__",
    #         # unpacking a list-comprehension, unpacking into a list
    #         # seems to be more performant than adding lists together.
    #         *(
    #             wxr.wtp.node_to_text(
    #                 node.largs[1:2] or node.largs[0],
    #             )
    #             # output the "visible" half of the link.
    #         ),
    #         # XXX collect link data if it turns out to be important.
    #         "__/L__",
    #     ]
    #     # print(f"{node.largs=}")
    return None


def extract_alt_form_templates(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
    siblings: list[str | WikiNode],
    siblings_index: int,
) -> None:
    """Parse form_of or alt_of templates.

    Supports:
    1. κλ | generic | form_of
    2. γρ | generic | form_of or alt_of
    3. πτώση/πτώσεις | nouns, adjectives etc. | form_of and tags
    4. υπο/υποκ | nouns | form_of
    5. μεγ/μεγεθ | nouns | form_of
    6. ρημ τύπος | verbs | form_of
    7. μτχ | verbs | form_of

    * References:
    1. https://el.wiktionary.org/wiki/Πρότυπο:κλ
    2. https://el.wiktionary.org/wiki/Module:άλλημορφή
    3. https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_κλιτικούς_τύπους
    4. https://el.wiktionary.org/wiki/Πρότυπο:υπο
    5. https://el.wiktionary.org/wiki/Πρότυπο:μεγ
    6. https://el.wiktionary.org/wiki/Πρότυπο:ρημ_τύπος
    7. https://el.wiktionary.org/wiki/Κατηγορία:Πρότυπα_για_μετοχές
    """
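    # Illustrative dispatch examples, with argument shapes assumed from the
    # templates above: {{κλ|...|λήμμα}} takes the form_of lemma from
    # argument 2, while {{υποκ|σπίτι}} takes it from argument 1.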

    t_name = t_node.template_name
    t_args = t_node.template_parameters

    basic_extract_form_of = partial(
        extract_form_of_templates_basic,
        wxr,
        parent_sense,
        siblings,
        siblings_index,
        t_name,
        t_node,
    )
    # Generic
    if t_name == "κλ":
        return basic_extract_form_of(extract_argument=2)

    # Generic
    # * Try parsing a "form_of" if the second template argument refers to a
    #   form (μορφ / μορφή / λόγια μορφή του, etc.).
    # * Otherwise, parse an "alt_of".
    #
    # Notes:
    # * All occurrences in wiktionary have at least one argument
    if t_name in ("γρ", "γραφή του", "alter") and 1 in t_args:
        if 2 in t_node.template_parameters:
            second_arg = t_node.template_parameters[2]
            if "μορφ" in clean_node(wxr, None, second_arg):
                return basic_extract_form_of(extract_argument=1)
        # We could add some tags here, but AltForm takes none
        word = clean_node(wxr, None, t_args[1]).strip()
        parent_sense.alt_of.append(AltForm(word=word))
        return

    # Nouns and adjectives
    if any(name in t_name for name in ("πτώσεις", "πτώση")) and 1 in t_args:
        return extract_form_of_templates_ptosi(wxr, parent_sense, t_node)

    # Nouns
    # Note that the "diminutive/augmentative" tags will be added later on
    # via translation of the "υποκοριστικό/μεγεθυντικό" raw_tags
    if t_name in ("υπο", "υποκ", "μεγ", "μεγεθ") and 1 in t_args:
        return basic_extract_form_of(extract_argument=1)

    # Verbs
    if t_name == "ρημ τύπος":
        return basic_extract_form_of(extract_argument=2)

    if t_name.startswith("μτχ"):
        return basic_extract_form_of(extract_argument=1)


def extract_form_of_templates_basic(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    siblings: list[str | WikiNode],
    sibling_index: int,
    t_name: str,
    t_node: TemplateNode,
    extract_argument: int | str,
) -> None:
    t_args = t_node.template_parameters
    if extract_argument in t_args:
        lemma = clean_node(wxr, None, t_args[extract_argument]).strip()
    else:
        # The mtxpp template has no args; consume the following links for
        # the form_of field.
        # cf. https://github.com/tatuylonen/wiktextract/issues/1372
        wxr.wtp.wiki_notice(
            f"Form-of template does not have lemma data: {t_name}, {t_args=}",
            sortid="pos/570/20250517",
        )
        links: list[str | WikiNode] = []
        for node in siblings[sibling_index + 1 :]:
            if not (
                (isinstance(node, str) and node.strip() == "")
                or (isinstance(node, WikiNode) and node.kind == NodeKind.LINK)
            ):
                break
            links.append(node)
        lemma = clean_node(wxr, None, links).strip()

    if lemma:
        form_of = AltForm(word=lemma)
        parent_sense.form_of.append(form_of)
    else:
        wxr.wtp.wiki_notice(
            "Lemma extracted from form-of template was empty or whitespace: "
            f"{t_name}, {t_args=}, {lemma=}",
            sortid="pos/609/20250925",
        )


PTOSI_GENDER_INFLECTION_MAP = {
    "θηλ": "feminine",
    "αρσ": "masculine",
    "ουδ": "neuter",
}
PTOSI_NUMBER_INFLECTION_MAP = {
    "εν": "singular",
    "πλ": "plural",
}
PTOSI_CASE_INFLECTION_MAP = {
    "Ο": "nominative",
    "Α": "accusative",
    "Γ": "genitive",
    "Κ": "vocative",
}


def extract_form_of_templates_ptosi(
    wxr: WiktextractContext,
    parent_sense: Sense | WordEntry,
    t_node: TemplateNode,
) -> None:
    """Parse form_of for nouns and adjectives.

    Supports:
    * [gender του] πτώση-πτώσεις templates

    Notes:
    * The πτώση-πτώσεις templates contain:
      * Case(s): 1 for πτώση, >1 for πτώσεις - in uppercase characters.
      * Number: "εν" (singular) or "πλ" (plural)
    Examples:
    * {{πτώσηΑεν|κόρφος}} > accusative | singular
    * {{πτώσειςΟΚπλ|κόρφος}} > nominative, vocative | plural
    """
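    # Illustrative decomposition: "ουδ του-πτώσειςΟΑΚεν" splits at "-" into
    # the gender prefix "ουδ" (neuter) and "πτώσειςΟΑΚεν"; stripping the
    # "πτώσεις" prefix leaves cases "ΟΑΚ" (nominative, accusative, vocative)
    # and number "εν" (singular).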

    t_name = t_node.template_name
    inflection_t_names = ("πτώσεις", "πτώση")
    tags: list[str] = []

    # Parse and consume gender if any
    if "-" in t_name:
        # Cf. {{ουδ του-πτώσειςΟΑΚεν|καλός|grc}}
        gender, inflection = t_name.split("-")
        code = gender[:3]
        try:
            gender_tag = PTOSI_GENDER_INFLECTION_MAP[code]
        except KeyError:
            # Bad template name.
            return
        tags.append(gender_tag)
    else:
        inflection = t_name

    # Remove the πτώση/πτώσεις prefix
    for prefix in inflection_t_names:
        if inflection.startswith(prefix):
            inflection = inflection[len(prefix) :]
            break

    try:
        lowercase = "".join(ch for ch in inflection if ch.islower())
        number = PTOSI_NUMBER_INFLECTION_MAP[lowercase]
        uppercase = [ch for ch in inflection if not ch.islower()]
        cases = [PTOSI_CASE_INFLECTION_MAP[ch] for ch in uppercase]
    except KeyError:
        # Bad template name.
        return

    tags.extend([*cases, number])
    tags.sort()  # For the tests, but also good practice

    lemma = clean_node(wxr, None, t_node.template_parameters[1])
    form_of = AltForm(word=lemma)
    parent_sense.form_of.append(form_of)
    parent_sense.tags.extend(tags)


def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Take what is preferably a line of text and extract tags and a gloss
    from it. The data is inserted into parent_sense, and for recursion
    purposes we return a boolean that tells whether there was any gloss text
    in a lower node."""
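    # Illustrative example: a line like "(μεταφορικά) ανοησίες" yields the
    # raw tag "μεταφορικά" via the template-less parentheses handling below
    # and the gloss text "ανοησίες".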

    if len(contents) == 0:
        return False

    for i, t_node in enumerate(contents):
        if isinstance(t_node, TemplateNode):
            extract_alt_form_templates(wxr, parent_sense, t_node, contents, i)

    template_tags: list[str] = []

    bl_linkages: list[Linkage] = []
    no_gloss_but_keep_anyway = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        nonlocal bl_linkages
        if name == "βλ":
            for k, v in ht.items():
                if isinstance(k, int):
                    bl_linkages.append(Linkage(word=clean_node(wxr, None, v)))
            return ""
        return None
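    # Illustrative example: {{βλ|λέξη|φράση}} ("see also") is removed from
    # the gloss text and contributes Linkage(word="λέξη") and
    # Linkage(word="φράση") to bl_linkages.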

    # The rest of the text.
    text = clean_node(
        wxr,
        parent_sense,
        contents,
        template_fn=bl_template_handler_fn,
        node_handler_fn=bold_node_fn,
    )

    if len(bl_linkages) > 0:
        parent_sense.related.extend(bl_linkages)
        no_gloss_but_keep_anyway = True

    if not text.strip():
        if len(bl_linkages) <= 0:
            return False

    # print(f"  ============ {contents=}, {text=}")

    # Greek Wiktionary uses a lot of template-less tags.
    if parens_n := PARENS_BEFORE_RE.match(text):
        blocks = ITER_PARENS_RE.findall(parens_n.group(0))
        # print(f"{blocks=}")
        kept_blocks: list[str] = []
        forms: list[str] = []
        raw_tag_texts: list[str] = []
        for block in blocks:
            if block_has_non_greek_text(block):
                # Keep parenthesized blocks with non-Greek text as part of
                # the gloss text.
                kept_blocks.extend(("(", block, ") "))
                continue
            nforms, nraw_tag_texts = extract_forms_and_tags(block)
            forms.extend(nforms)
            raw_tag_texts.extend(nraw_tag_texts)
        # print(f"{forms=}, {raw_tag_texts=}")
        if forms:
            # print(f"{forms=}")
            parent_sense.related.extend(Linkage(word=form) for form in forms)
        parent_sense.raw_tags.extend(raw_tag_texts)
        kept_blocks.append(text[parens_n.end() :])
        text = "".join(kept_blocks)

    text = re.sub(r"__/?[IB]__", "", text)

    if len(template_tags) > 0:
        parent_sense.raw_tags.extend(template_tags)

    if len(text) > 0:
        parent_sense.glosses.append(text)
        return True

    if no_gloss_but_keep_anyway:
        parent_sense.tags.append("no-gloss")
        return True

    return False


Related: TypeAlias = Linkage
Synonym: TypeAlias = Linkage
Antonym: TypeAlias = Linkage


def recurse_glosses1(
    wxr: WiktextractContext,
    parent_sense: Sense,
    node: WikiNode,
) -> tuple[
    list[Sense],
    list[Example],
    list[Related],
    list[Synonym],
    list[Antonym],
]:
    """Helper function for recurse_glosses"""
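    # Returns a (senses, examples, related, synonyms, antonyms) tuple;
    # non-sense items bubble up until a parent list item with a gloss
    # absorbs them into its Sense.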

    # print(f"{node=}")

    ret_senses: list[Sense] = []
    ret_examples: list[Example] = []
    ret_related: list[Related] = []
    ret_synonyms: list[Synonym] = []
    ret_antonyms: list[Antonym] = []
    found_gloss = False

    # Pydantic stuff doesn't play nice with Tatu's manual dict manipulation
    # functions, so we'll use a dummy dict here that we then check for
    # content and apply to `parent_sense`.
    dummy_parent: dict[str, Any] = {}

    related_linkages: list[Linkage] = []
    example_is_synonym = False
    example_is_antonym = False

    def bl_template_handler_fn(name: str, ht: TemplateArgs) -> str | None:
        nonlocal related_linkages
        nonlocal example_is_synonym
        nonlocal example_is_antonym
        # Sometimes the bl-templates point to synonyms or antonyms, instead
        # of just "related"; we save them, and if example_is_xxxnym is true,
        # we later return them as xxxnyms.
        if name == "βλ":
            for k, v in ht.items():
                if isinstance(k, int):
                    related_linkages.append(
                        Linkage(word=clean_node(wxr, None, v))
                    )
            return ""
        if name in ("συνων", "συνών"):
            example_is_synonym = True
            return ""
        if name in ("αντων", "αντών"):
            example_is_antonym = True
            return ""
        return None

    # List nodes contain only LIST_ITEM nodes, which may contain sub-LIST
    # nodes.
    if node.kind == NodeKind.LIST:
        list_ret: tuple[
            list[Sense],
            list[Example],
            list[Related],
            list[Synonym],
            list[Antonym],
        ] = ([], [], [], [], [])
        for child in node.children:
            if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM:
                # This should never happen
                wxr.wtp.error(
                    f"{child=} is direct child of NodeKind.LIST",
                    sortid="simple/pos/44",
                )
                continue
            (
                senses,
                examples,
                related,
                synonyms,
                antonyms,
            ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child)
            list_ret[0].extend(senses)
            list_ret[1].extend(examples)
            list_ret[2].extend(related)
            list_ret[3].extend(synonyms)
            list_ret[4].extend(antonyms)
        return list_ret

    elif node.kind == NodeKind.LIST_ITEM:
        # Split at the first LIST node found
        split_at = next(
            (
                i
                for i, c in enumerate(node.children)
                if isinstance(c, WikiNode) and c.kind == NodeKind.LIST
            ),
            len(node.children),
        )
        contents = node.children[:split_at]
        sublists = node.children[split_at:]

        # A LIST and LIST_ITEM `sarg` is basically the prefix of the line,
        # like `#` or `##:`: the token that appears at the very start of a
        # line that is used to parse the depth and structure of lists.
        # `#` Item 1
        # `##` Item 1.1
        # `##*` Example 1.1
        if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"):
            # This is either a quotation or an example.
            text = clean_node(
                wxr, dummy_parent, contents, template_fn=bl_template_handler_fn
            ).strip("⮡ \n")

            # print(f"{contents=}, {text=}, {related_linkages=}")

            if example_is_synonym or example_is_antonym:
                link_linkages = []
                for snode in contents:
                    if not isinstance(snode, WikiNode):
                        continue
                    if snode.kind == NodeKind.LINK:
                        link_linkages.append(
                            Linkage(
                                word=clean_node(wxr, None, snode.largs[0][0])
                            )
                        )
                    else:
                        for link in snode.find_child_recursively(NodeKind.LINK):
                            link_linkages.append(
                                Linkage(word=clean_node(wxr, None, link))
                            )

                # print("=====")
                # print(f"{link_linkages=}")

                if example_is_synonym:
                    return [], [], [], link_linkages + related_linkages, []
                elif example_is_antonym:
                    return [], [], [], [], link_linkages + related_linkages

            if len(related_linkages) > 0:
                # parent_sense.related.extend(bl_linkages)
                # related_linkages = []
                # if not text.strip():
                return [], [], related_linkages, [], []

            example_is_synonym = False
            example_is_antonym = False

            if not text.strip():
                return [], [], [], [], []

            example = Example(text=text)
            # logger.debug(f"{wxr.wtp.title}/example\n{text}")
            if len(sublists) > 0:
                translation = clean_node(wxr, dummy_parent, sublists).strip(
                    "#*: \n"
                )
                if translation != "":
                    example.translation = translation

            for k, v in dummy_parent.items():
                if k == "categories":
                    parent_sense.categories.extend(v)
            dummy_parent = {}

            return [], [example], [], [], []

        found_gloss = parse_gloss(wxr, parent_sense, contents)

        for sl in sublists:
            if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST):
                # Should not happen
                wxr.wtp.error(
                    f"Sublist is not NodeKind.LIST: {sublists=!r}",
                    sortid="simple/pos/82",
                )
                continue
            (
                senses,
                examples,
                related,
                synonyms,
                antonyms,
            ) = recurse_glosses1(wxr, parent_sense.model_copy(deep=True), sl)
            ret_senses.extend(senses)
            ret_examples.extend(examples)
            ret_related.extend(related)
            ret_synonyms.extend(synonyms)
            ret_antonyms.extend(antonyms)

    if len(ret_senses) > 0:
        # The recursion returned actual senses from below, so we will
        # ignore everything else (incl. any example data that might have
        # been given to parent_sense) and return that instead.
        # XXX if this becomes relevant, add the example data to a returned
        # subsense instead?
        # if any(
        #     isinstance(r, Sense) and r.tags == ["no-gloss"] for r in ret
        # ):
        #     print(f"{ret=}")
        return (
            combine_senses_with_identical_glosses(ret_senses),
            [],
            [],
            [],
            [],
        )

    # If nothing came from below, then this.
    if found_gloss is True or "no-gloss" in parent_sense.tags:
        parent_sense.examples.extend(ret_examples)
        parent_sense.related.extend(ret_related)
        parent_sense.synonyms.extend(ret_synonyms)
        parent_sense.antonyms.extend(ret_antonyms)

        return [parent_sense], [], [], [], []

    return [], ret_examples, ret_related, ret_synonyms, ret_antonyms


def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Recurse through WikiNodes to find glosses and sense-related data."""
    base_sense = Sense()
    ret: list[Sense] = []

    senses, examples, related, synonyms, antonyms = recurse_glosses1(
        wxr, base_sense, node
    )
    if (
        len(examples) > 0
        or len(related) > 0
        or len(synonyms) > 0
        or len(antonyms) > 0
    ):
        wxr.wtp.error(
            "Non-Sense data has bubbled up to recurse_glosses: "
            f"{examples=}, {related=}, {synonyms=}, {antonyms=}",
            sortid="pos/glosses/966",
        )
    for sense in senses:
        convert_tags_in_sense(sense)
        ret.append(sense)

    return ret


def split_nodes_to_lines(
    nodes: list[WikiNode | str],
) -> Iterator[list[WikiNode | str]]:
    """Take a list of nodes and split up the list into lines.
    This could be done by using node_to_wikitext() to reverse the parsing,
    and then you could parse the individual lines after splitting the text,
    but it seems unnecessary in the context of Greek Wiktionary PoS sections.
    """
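    # Illustrative example: the input ["head ", <BOLD>, " tags\n", <LIST>]
    # yields ["head ", <BOLD>, " tags"] at the newline and then [<LIST>] as
    # its own line; ignored templates like {{clear}} are dropped along the
    # way.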

    parts: list[WikiNode | str] = []
    for node in nodes:
        if isinstance(node, WikiNode):
            # Lists are returned whole; they're their own line
            if node.kind == NodeKind.LIST:
                if len(parts) > 0:
                    yield parts
                    parts = []
                yield [node]
                continue
            if isinstance(node, TemplateNode) and node.template_name in (
                # Ignore specific templates, like {{((}} that bookends a
                # column.
                "((",
                "))",
                "clear",
                "κλείδα-ελλ",
            ):
                continue
            parts.append(node)
        else:
            if "\n" in node:
                split_string = node.splitlines()
                for spl in split_string[:-1]:
                    if spl:
                        parts.append(spl)
                    yield parts
                    parts = []
                # special handling for final newline; splitlines ignores it
                if node.endswith("\n"):
                    if split_string[-1]:
                        parts.append(split_string[-1])
                    yield parts
                    parts = []
                elif split_string[-1]:
                    parts.append(split_string[-1])
            elif node:
                parts.append(node)

    # yield final parts
    if len(parts) > 0:
        yield parts


BOLD_RE = re.compile(r"(__/?[BI]__|, |\. )")
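# Illustrative example: BOLD_RE.split("__B__λέξη__/B__, και") produces
# ["", "__B__", "λέξη", "__/B__", "", ", ", "και"]; even indices are text
# segments, odd indices are the separators themselves.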


def extract_forms_and_tags(tagged_text: str) -> tuple[list[str], list[str]]:
    forms: list[str] = []
    tags: list[str] = []

    # print(f"{tagged_text=}")
    # inside_italics = False
    inside_bold = False

    for i, t in enumerate(BOLD_RE.split(tagged_text)):
        t = t.strip()
        # print(f"{i}: {t=}")
        if not t:
            continue

        if i % 2 == 0:
            # Text between splitters
            if inside_bold is True:
                forms.append(t)
                continue
            # Add everything else to raw_tags
            # if inside_italics is True:
            #     tags.append(t)
            #     continue
            # ". " and ", " just split; they're left stripped to "." and ","
            # in case this needs to be modified later.
            tags.append(t)
            continue
        match t:
            case "__B__":
                inside_bold = True
            case "__/B__":
                inside_bold = False
            # case "__I__":
            #     inside_italics = True
            # case "__/I__":
            #     inside_italics = False

    return forms, tags


META_RE = re.compile(r"__/?[ILEB]__")


def block_has_non_greek_text(text: str) -> bool:
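    # A word counts as non-Greek if the Unicode name of its first alphabetic
    # character does not start with "GREEK"; e.g. unicode_name("a") is
    # "LATIN SMALL LETTER A" while unicode_name("α") is
    # "GREEK SMALL LETTER ALPHA".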

    text = META_RE.sub("", text)
    for t in text.split():
        for ch in t:
            if not ch.isalpha():
                continue
            if not unicode_name(ch).startswith("GREEK"):
                return True
            break
    return False


def combine_senses_with_identical_glosses(
    orig_senses: list[Sense],
) -> list[Sense]:
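    # Merge senses whose `glosses` lists are identical (e.g. duplicates
    # produced by parallel list branches); the first such sense absorbs the
    # others via Sense.merge().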

    glosses_to_senses: dict[tuple[str, ...], list[Sense]] = {}
    senses: list[Sense] = []

    found_identical_glosses = False

    for item in orig_senses:
        glosses_key = tuple(item.glosses)
        if glosses_key not in glosses_to_senses:
            glosses_to_senses[glosses_key] = [item]
        else:
            glosses_to_senses[glosses_key].append(item)
            found_identical_glosses = True

    if not found_identical_glosses:
        return orig_senses

    for twinned_senses in glosses_to_senses.values():
        main_sense = twinned_senses[0]
        for other_sense in twinned_senses[1:]:
            main_sense.merge(other_sense)
        senses.append(main_sense)

    return senses