Coverage for src/wiktextract/extractor/ku/pos.py: 78%

218 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2from itertools import count 

3 

4from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .example import extract_example_list_item 

9from .form_table import ( 

10 extract_ku_tewîn_lk_template, 

11 extract_ku_tewîn_nav_template, 

12) 

13from .models import AltForm, Form, Sense, WordEntry 

14from .section_titles import POS_DATA 

15from .tags import TAGS, translate_raw_tags 

16 

# Template names that are passed to extract_form_of_template() when they
# appear in a POS section or a gloss list item.
FORM_OF_TEMPLATES = frozenset(
    {
        "formeke peyvê",
        "inflection of",
        "dem2",
        "guherto",
        "guharto",
        "rastnivîs",
        "şaşnivîs",
        "şaşî",
        "kevnbûyî",
        "binêre",
        "bnr",
        "binêre2",
        "bnr2",
        "awayekî din",
        "ad",
        "komparatîv",
        "kom",
        "sûperlatîv",
        "sûp",
        "dem",
        "dema-bê",
        "dema-fireh",
        "raboriya-sade",
        "rehê dema niha",
    }
)
# Template-name suffixes that are treated the same way as the names above.
# Kept as a tuple because it is passed directly to str.endswith().
FORM_OF_TEMPLATE_SUFFIXES = (
    "-dema-bê",
    "-dema-bê-p",
    "-dema-niha",
    "-dema-niha-p",
    "-fermanî",
)

52 

53 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Start a new word entry for a part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data`` and receives
    the POS name and tags looked up in ``POS_DATA``.  Items of "#" lists
    become senses, the children located before the first such list are
    handled as headword nodes, and direct child templates that are form-of
    templates each produce one extra sense.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Position of the earliest gloss list among the section's children;
    # stays at "end of children" when the section has no gloss list.
    first_gloss_index = len(level_node.children)
    for child_index, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)
            first_gloss_index = min(first_gloss_index, child_index)

    extract_pos_header_nodes(
        wxr, entry, level_node.children[:first_gloss_index]
    )

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name == "binêre/el":
            extract_binêre_el_template(wxr, entry, t_node)
            continue
        if name not in FORM_OF_TEMPLATES and not name.endswith(
            FORM_OF_TEMPLATE_SUFFIXES
        ):
            continue
        # A form-of template placed directly in the section gets its own
        # sense; the sense is kept even if the template expands to nothing.
        sense = Sense()
        extract_form_of_template(wxr, sense, t_node)
        gloss = clean_node(wxr, sense, t_node)
        if gloss != "":
            sense.glosses.append(gloss)
        entry.senses.append(sense)

91 

92 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Build a sense from one gloss list item and add it to ``word_entry``.

    The sense starts as a deep copy of ``parent_sense`` when one is given.
    Label templates feed ``raw_tags``, form-of templates are recorded and
    also kept in the gloss text, and nested lists are handled as examples
    ("#:" / "#*") or child glosses ("##").  A sense without gloss text is
    still added if it ended up with examples.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            # Nested lists are processed separately below.
            if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
                gloss_parts.append(child)
            continue

        name = child.template_name
        if name in ["f", "ferhengok"]:
            extract_ferhengok_template(wxr, sense, child)
        elif name in FORM_OF_TEMPLATES or name.endswith(
            FORM_OF_TEMPLATE_SUFFIXES
        ):
            extract_form_of_template(wxr, sense, child)
            gloss_parts.append(child)
        elif name in ["bajar"]:
            # Expanded only for its side effects on the sense; the text
            # itself is not part of the gloss.
            clean_node(wxr, sense, child)
            sense.topics.append("city")
        else:
            expanded_text = clean_node(wxr, sense, child)
            # Fully parenthesised output is taken as a label, anything
            # else is gloss text.
            if expanded_text.startswith("(") and expanded_text.endswith(")"):
                sense.raw_tags.append(expanded_text.strip("() "))
            else:
                gloss_parts.append(expanded_text)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith((":", "*")):
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, example_item)
        elif marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)

    # The sense was not added above when it had no gloss text; keep it
    # anyway if the nested lists gave it examples.
    if len(sense.glosses) == 0 and len(sense.examples) > 0:
        word_entry.senses.append(sense)

145 

146 

def extract_ferhengok_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the labels produced by a "ferhengok" template to ``sense.raw_tags``.

    The expanded text is stripped of surrounding parentheses and spaces and
    split on commas and on the words " an " and " û "; every non-empty piece
    becomes one raw tag.
    """
    # https://ku.wiktionary.org/wiki/Şablon:ferhengok
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    pieces = (piece.strip() for piece in re.split(r",| an | û ", label_text))
    sense.raw_tags.extend(piece for piece in pieces if piece != "")

156 

157 

# Headword-line templates, one per part of speech; see
# https://ku.wiktionary.org/wiki/Alîkarî:Cureyên_peyvan
# extract_pos_header_nodes() reads the "tr" argument of these templates.
POS_HEADER_TEMPLATES = frozenset(
    {
        "navdêr",
        "serenav",
        "lêker",
        "rengdêr",
        "hoker",
        "cînav",
        "baneşan",
        "daçek",
        "pêşdaçek",
        "paşdaçek",
        "bazinedaçek",
        "girêdek",
        "artîkel",
        "pirtik",
        "navgir",
        "paşgir",
        "pêşgir",
        "reh",
        "biwêj",
        "hevok",
        "gp",
        "hejmar",
        "tîp",
        "sembol",
        "kurtenav",
    }
)

188 

189 

def extract_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Extract headword data from the nodes that precede the gloss list.

    Only template nodes are considered.  POS header templates contribute a
    romanization form from their "tr" argument; noun, verb and inflection
    table templates are passed on to their dedicated extractors.
    """
    for node in nodes:
        if not isinstance(node, TemplateNode):
            continue
        name = node.template_name

        if name in POS_HEADER_TEMPLATES:
            romanization = Form(
                form=clean_node(
                    wxr, None, node.template_parameters.get("tr", "")
                ),
                tags=["romanization"],
            )
            # "-" is skipped along with the empty string.
            if romanization.form not in ["", "-"]:
                word_entry.forms.append(romanization)
            # Expanded with the entry as target and the text discarded.
            clean_node(wxr, word_entry, node)

        if name in ["navdêr", "serenav"]:
            extract_navdêr_template(wxr, word_entry, node)
        elif name == "lêker":
            extract_lêker_template(wxr, word_entry, node)
        elif name in ["ku-tewîn-nav", "ku-tew-nav", "ku-tewîn-rd"]:
            extract_ku_tewîn_nav_template(wxr, word_entry, node)
        elif name == "ku-tewîn-lk":
            extract_ku_tewîn_lk_template(wxr, word_entry, node)

225 

226 

def extract_navdêr_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract tags and forms from a "navdêr" or "serenav" headword template.

    The "z" and "j" arguments are mapped to tags on ``word_entry``.  The
    form arguments listed in ``FORMS`` (and their numbered variants such as
    "m2", "m3", ...) are added as forms.  The template is then expanded to
    collect the parenthesised italic romanization, the displayed headword
    when it differs from the page title, and the transliteration.
    """
    # https://ku.wiktionary.org/wiki/Şablon:navdêr
    # Şablon:serenav
    GENDERS = {
        "n": "masculine",
        "n+": "masculine-usually",
        "m": "feminine",
        "m+": "feminine-usually",
        "nt": "gender-neutral",
        "mn": ["feminine", "masculine"],
        "m/n": ["feminine", "masculine"],
        "g": "common-gender",
    }
    NUMBERS = {
        "p": "plural",
        "p+": "plural-normally",
        "tp": "plural-only",
        "y": "singular",
        "nj": "uncountable",
        "j/nj": ["countable", "uncountable"],
    }
    # "z" is looked up in the gender table, "j" in the number table.
    for arg_name, tag_map in (("z", GENDERS), ("j", NUMBERS)):
        arg_value = clean_node(
            wxr, None, t_node.template_parameters.get(arg_name, "")
        )
        if arg_value not in tag_map:
            continue
        tag = tag_map[arg_value]
        if isinstance(tag, str):
            word_entry.tags.append(tag)
        elif isinstance(tag, list):
            word_entry.tags.extend(tag)

    FORMS = {
        "m": "feminine",
        "n": "masculine",
        "nt": "gender-neutral",
        "y": "singular",
        "p": "plural",
        "np": ["masculine", "plural"],
        "mp": ["feminine", "plural"],
        "lk": "verb-from-noun",
        "hanja": "Hanja",
    }
    for form_arg, tag in FORMS.items():
        if form_arg not in t_node.template_parameters:
            continue
        extract_navdêr_template_form(wxr, word_entry, t_node, form_arg, tag)
        # Further values are read from numbered variants of the argument.
        # Each name is built from the base name: appending the index to the
        # previous name would look up "m23" after "m2" instead of "m3".
        for index in count(2):
            numbered_arg = f"{form_arg}{index}"
            if numbered_arg not in t_node.template_parameters:
                break
            extract_navdêr_template_form(
                wxr, word_entry, t_node, numbered_arg, tag
            )

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Italic text wrapped in parentheses is recorded as a romanization.
    for i_tag in expanded_node.find_html_recursively("i"):
        i_text = clean_node(wxr, None, i_tag)
        if i_text.startswith("(") and i_text.endswith(")"):
            word_entry.forms.append(
                Form(form=i_text.strip("() "), tags=["romanization"])
            )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        for strong_tag in main_span_tag.find_html(
            "strong", attr_name="class", attr_value="headword"
        ):
            strong_str = clean_node(wxr, None, strong_tag)
            # A headword equal to the page title adds no information.
            if strong_str not in ["", wxr.wtp.title]:
                word_entry.forms.append(
                    Form(form=strong_str, tags=["canonical"])
                )
        for roman_span in main_span_tag.find_html(
            "span", attr_name="class", attr_value="headword-tr"
        ):
            roman = clean_node(wxr, None, roman_span)
            if roman != "":
                word_entry.forms.append(
                    Form(form=roman, tags=["transliteration"])
                )

    # Expanded once more with the entry as target; the text is discarded.
    clean_node(wxr, word_entry, expanded_node)

316 

317 

def extract_navdêr_template_form(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    arg_name: str,
    tag: str | list[str],
) -> None:
    """Add the value of template argument ``arg_name`` as a tagged form.

    Nothing happens when the argument is absent or cleans to an empty
    string.  ``tag`` may be a single tag or a list of tags.
    """
    params = t_node.template_parameters
    if arg_name not in params:
        return

    if isinstance(tag, str):
        new_tags = [tag]
    elif isinstance(tag, list):
        new_tags = tag
    else:
        new_tags = []

    form = Form(form=clean_node(wxr, None, params[arg_name]))
    form.tags.extend(new_tags)
    if form.form != "":
        word_entry.forms.append(form)

336 

337 

def extract_lêker_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract tags and forms from a "lêker" (verb) headword template.

    The "c" argument is split on "-" and each recognised code adds a tag to
    ``word_entry``.  The "nd", "niha", "borî" and "subj" arguments are added
    as forms through ``extract_lêker_template_form``.
    """
    # https://ku.wiktionary.org/wiki/Şablon:lêker
    CLASS_TAGS = {
        "gh": "transitive",
        "ngh": "intransitive",
        "x": "proper-noun",
        "p": "compound",
        "h": "compound",
        "b": "idiomatic",
    }
    class_codes = clean_node(
        wxr, None, t_node.template_parameters.get("c", "")
    ).split("-")
    word_entry.tags.extend(
        CLASS_TAGS[code] for code in class_codes if code in CLASS_TAGS
    )

    form_args = (
        ("nd", "noun-from-verb"),
        ("niha", "present"),
        ("borî", "past"),
        ("subj", "subjunctive"),
    )
    for form_arg, form_tag in form_args:
        extract_lêker_template_form(wxr, word_entry, t_node, form_arg, form_tag)

362 

363 

def extract_lêker_template_form(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    arg_name: str,
    tag: str,
) -> None:
    """Add the form stored in ``arg_name`` and, if present, its "2" variant.

    The romanization is read from the argument named ``arg_name`` + "tr".
    The "2" variant is only tried when ``arg_name`` itself is present, is
    not "nd", and does not already end in "2".
    """
    params = t_node.template_parameters
    arg_names = [arg_name]
    if arg_name != "nd" and not arg_name.endswith("2"):
        arg_names.append(arg_name + "2")

    for name in arg_names:
        # A missing argument ends the lookup: the "2" variant is never
        # consulted when the base argument is absent.
        if name not in params:
            return
        form = Form(
            form=clean_node(wxr, None, params[name]),
            tags=[tag],
            roman=clean_node(wxr, None, params.get(name + "tr", "")),
        )
        if form.form != "":
            word_entry.forms.append(form)

386 

387 

def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the word(s) a form-of or alt-of template refers to.

    The template name decides which arguments may hold the referenced
    word.  "binêre"/"bnr" add every non-empty argument to ``sense.alt_of``;
    all other templates add only the first non-empty candidate to
    ``sense.form_of``.  The matching "alt-of"/"form-of" tag is added once.
    For "formeke peyvê"/"inflection of", positional arguments from 4
    onwards are additionally translated into tags through ``TAGS``.
    """
    # Şablon:formeke peyvê
    name = t_node.template_name
    params = t_node.template_parameters
    has_tag_args = name in ["formeke peyvê", "inflection of"]
    is_alt_of = False
    break_first_arg = True

    if has_tag_args:
        form_args = ["cude", 3, 2]
    elif name in [
        "dem2",
        "guherto",
        "guharto",
        "rastnivîs",
        "şaşnivîs",
        "şaşî",
        "kevnbûyî",
        "binêre2",
        "bnr2",
        "awayekî din",
        "ad",
        "komparatîv",
        "kom",
        "sûperlatîv",
        "sûp",
        "dema-bê",
        "dema-fireh",
        "raboriya-sade",
    ]:
        form_args = [2]
    elif name.endswith(
        ("-dema-bê", "-dema-bê-p", "-dema-niha", "-dema-niha-p", "-fermanî")
    ):
        form_args = [1]
    elif name == "dem":
        form_args = [3]
    elif name == "rehê dema niha":
        # This template is handled from its expanded output instead.
        extract_rehê_dema_niha_template(wxr, sense, t_node)
        return
    elif name in ["binêre", "bnr"]:
        form_args = [1, 2, 3, 4]
        is_alt_of = True
        break_first_arg = False
    else:
        form_args = []

    relation_tag = "alt-of" if is_alt_of else "form-of"
    for arg in form_args:
        form_str = clean_node(wxr, None, params.get(arg, ""))
        if form_str == "":
            continue

        if is_alt_of:
            sense.alt_of.append(AltForm(word=form_str))
        else:
            sense.form_of.append(AltForm(word=form_str))
        if relation_tag not in sense.tags:
            sense.tags.append(relation_tag)

        if has_tag_args:
            # Consecutive positional arguments starting at 4 name the
            # grammatical tags; stop at the first missing position.
            for tag_arg in count(4):
                if tag_arg not in params:
                    break
                raw_tag = clean_node(wxr, None, params[tag_arg]).capitalize()
                if raw_tag not in TAGS:
                    continue
                tr_tag = TAGS[raw_tag]
                if isinstance(tr_tag, str):
                    sense.tags.append(tr_tag)
                elif isinstance(tr_tag, list):
                    sense.tags.extend(tr_tag)

        if break_first_arg:
            break

460 

461 

def extract_rehê_dema_niha_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the bold words of an expanded "rehê dema niha" template as form-of.

    The template is expanded and each non-empty bold child node becomes an
    entry in ``sense.form_of``; the "form-of" tag is added at most once.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    for bold_node in expanded_node.find_child(NodeKind.BOLD):
        word = clean_node(wxr, None, bold_node)
        if word == "":
            continue
        sense.form_of.append(AltForm(word=word))
        if "form-of" not in sense.tags:
            sense.tags.append("form-of")

474 

475 

def extract_binêre_el_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Mark an entry as an obsolete alternative form via "binêre/el".

    The first template argument names the referenced word.  It is attached
    to the last sense of ``word_entry``; when the entry has no sense yet, a
    new sense tagged "no-gloss" is created and appended.  Nothing happens
    when the argument is empty.
    """
    first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if first_arg == "":
        return

    has_sense = len(word_entry.senses) > 0
    sense = word_entry.senses[-1] if has_sense else Sense(tags=["no-gloss"])
    sense.alt_of.append(AltForm(word=first_arg))
    # The reused sense may already carry these tags (for example from an
    # earlier form-of template), so add each one only if it is missing.
    for tag in ("alt-of", "obsolete"):
        if tag not in sense.tags:
            sense.tags.append(tag)
    if not has_sense:
        word_entry.senses.append(sense)