Coverage for src/wiktextract/extractor/ku/pos.py: 78%

209 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2from itertools import count 

3 

4from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .example import extract_example_list_item 

9from .form_table import ( 

10 extract_ku_tewîn_lk_template, 

11 extract_ku_tewîn_nav_template, 

12) 

13from .models import AltForm, Form, Sense, WordEntry 

14from .section_titles import POS_DATA 

15from .tags import TAGS, translate_raw_tags 

16 

# Template names that extract_pos_section() and extract_gloss_list_item()
# hand over to extract_form_of_template().
FORM_OF_TEMPLATES = frozenset(
    {
        "formeke peyvê",
        "inflection of",
        "dem",
        "dem2",
        "dema-bê",
        "dema-fireh",
        "raboriya-sade",
        "rehê dema niha",
        "guherto",
        "guharto",
        "rastnivîs",
        "şaşnivîs",
        "şaşî",
        "kevnbûyî",
        "binêre",
        "bnr",
        "binêre2",
        "bnr2",
        "awayekî din",
        "ad",
        "komparatîv",
        "kom",
        "sûperlatîv",
        "sûp",
    }
)

# Template-name suffixes that also mark a form-of template.  This must stay a
# tuple because it is passed directly to str.endswith().
FORM_OF_TEMPLATE_SUFFIXES = (
    "-dema-bê",
    "-dema-bê-p",
    "-dema-niha",
    "-dema-niha-p",
    "-fermanî",
)

52 

53 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section and fill it in.

    A deep copy of ``base_data`` is appended to ``page_data``; its POS fields
    come from ``POS_DATA[pos_title]``.  Items of "#"-style lists become
    senses, the nodes in front of the first such list are treated as the POS
    header, and direct-child form-of templates become additional senses.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Index of the earliest gloss list among the section children; stays at
    # len(children) when no gloss list is found.
    header_end = len(level_node.children)
    for position, list_node in level_node.find_child(NodeKind.LIST, True):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            marker = list_node.sarg
            if not (marker.startswith("#") and marker.endswith("#")):
                continue
            extract_gloss_list_item(wxr, entry, list_item)
            header_end = min(header_end, position)

    extract_pos_header_nodes(wxr, entry, level_node.children[:header_end])

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name == "binêre/el":
            extract_binêre_el_template(wxr, entry, t_node)
            continue
        if name not in FORM_OF_TEMPLATES and not name.endswith(
            FORM_OF_TEMPLATE_SUFFIXES
        ):
            continue
        # A form-of template placed directly in the section yields a sense of
        # its own, with the expanded template text as the gloss.
        sense = Sense()
        extract_form_of_template(wxr, sense, t_node)
        gloss_text = clean_node(wxr, sense, t_node)
        if gloss_text != "":
            sense.glosses.append(gloss_text)
        entry.senses.append(sense)

91 

92 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item into a sense of ``word_entry``.

    The sense starts as a deep copy of ``parent_sense`` when one is given.
    Templates in the item contribute raw tags, form-of data or topics; the
    remaining nodes form the gloss text.  Nested lists are processed as
    examples ("#:" / "#*") or, recursively, as sub-glosses ("##").
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            # Nested lists are handled separately below.
            if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
                gloss_parts.append(child)
            continue

        name = child.template_name
        if name in ("f", "ferhengok"):
            extract_ferhengok_template(wxr, sense, child)
        elif name in FORM_OF_TEMPLATES or name.endswith(
            FORM_OF_TEMPLATE_SUFFIXES
        ):
            extract_form_of_template(wxr, sense, child)
            gloss_parts.append(child)
        elif name == "bajar":
            clean_node(wxr, sense, child)
            sense.topics.append("city")
        else:
            expanded = clean_node(wxr, sense, child)
            # Text fully wrapped in parentheses is kept as a raw tag rather
            # than as part of the gloss.
            if expanded.startswith("(") and expanded.endswith(")"):
                sense.raw_tags.append(expanded.strip("() "))
            else:
                gloss_parts.append(expanded)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        marker = child_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith((":", "*")):
            for example_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )
        elif marker.endswith("#"):
            for sub_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)

    # A sense without gloss text was not added above; still keep it when the
    # nested lists produced examples for it.
    if len(sense.glosses) == 0 and len(sense.examples) > 0:
        word_entry.senses.append(sense)

145 

146 

def extract_ferhengok_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the labels produced by a "ferhengok" template to ``sense.raw_tags``.

    The expanded text has surrounding parentheses and spaces stripped and is
    split on commas, " an " and " û "; empty pieces are dropped.
    """
    # https://ku.wiktionary.org/wiki/Şablon:ferhengok
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    labels = (piece.strip() for piece in re.split(r",| an | û ", label_text))
    sense.raw_tags.extend(label for label in labels if label != "")

156 

157 

# https://ku.wiktionary.org/wiki/Alîkarî:Cureyên_peyvan
# Template names that extract_pos_header_nodes() treats as POS header lines.
POS_HEADER_TEMPLATES = frozenset(
    {
        "navdêr",
        "serenav",
        "lêker",
        "rengdêr",
        "hoker",
        "cînav",
        "baneşan",
        "daçek",
        "pêşdaçek",
        "paşdaçek",
        "bazinedaçek",
        "girêdek",
        "artîkel",
        "pirtik",
        "navgir",
        "paşgir",
        "pêşgir",
        "reh",
        "biwêj",
        "hevok",
        "gp",
        "hejmar",
        "tîp",
        "sembol",
        "kurtenav",
    }
)

188 

189 

def extract_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Process the nodes that precede the gloss list of a POS section.

    Only template nodes are considered.  POS header templates may add a
    romanization form taken from their "tr" argument; noun, verb and
    inflection-table templates are passed on to their dedicated extractors.
    """
    for node in nodes:
        if not isinstance(node, TemplateNode):
            continue
        name = node.template_name

        if name in POS_HEADER_TEMPLATES:
            romanization = Form(
                form=clean_node(
                    wxr, None, node.template_parameters.get("tr", "")
                ),
                tags=["romanization"],
            )
            if romanization.form not in ["", "-"]:
                word_entry.forms.append(romanization)
            clean_node(wxr, word_entry, node)

        if name in ("navdêr", "serenav"):
            extract_navdêr_template(wxr, word_entry, node)
        elif name == "lêker":
            extract_lêker_template(wxr, word_entry, node)
        elif name in ("ku-tewîn-nav", "ku-tew-nav", "ku-tewîn-rd"):
            extract_ku_tewîn_nav_template(wxr, word_entry, node)
        elif name == "ku-tewîn-lk":
            extract_ku_tewîn_lk_template(wxr, word_entry, node)

225 

226 

def extract_navdêr_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract tags and forms from a "navdêr" / "serenav" header template.

    The "z" and "j" arguments are mapped to entry tags through GENDERS and
    NUMBERS.  Each form argument listed in FORMS, plus its numbered variants
    ("m2", "m3", ...), becomes a Form.  The template is also expanded so
    that italic text in parentheses can be stored as a romanization form.
    """
    # https://ku.wiktionary.org/wiki/Şablon:navdêr
    # Şablon:serenav
    GENDERS = {
        "n": "masculine",
        "n+": "masculine-usually",
        "m": "feminine",
        "m+": "feminine-usually",
        "nt": "gender-neutral",
        "mn": ["feminine", "masculine"],
        "m/n": ["feminine", "masculine"],
        "g": "common-gender",
    }
    NUMBERS = {
        "p": "plural",
        "p+": "plural-normally",
        "tp": "plural-only",
        "y": "singular",
        "nj": "uncountable",
        "j/nj": ["countable", "uncountable"],
    }
    for param, mapping in (("z", GENDERS), ("j", NUMBERS)):
        value = clean_node(wxr, None, t_node.template_parameters.get(param, ""))
        tag = mapping.get(value)
        if isinstance(tag, str):
            word_entry.tags.append(tag)
        elif isinstance(tag, list):
            word_entry.tags.extend(tag)

    FORMS = {
        "m": "feminine",
        "n": "masculine",
        "nt": "gender-neutral",
        "y": "singular",
        "p": "plural",
        "np": ["masculine", "plural"],
        "mp": ["feminine", "plural"],
        "lk": "verb-from-noun",
        "hanja": "Hanja",
    }
    for form_arg, tag in FORMS.items():
        if form_arg not in t_node.template_parameters:
            continue
        extract_navdêr_template_form(wxr, word_entry, t_node, form_arg, tag)
        for index in count(2):
            # Build the numbered name from the unmodified base each time;
            # appending to form_arg in place would probe "m2", "m23", "m234"
            # instead of "m2", "m3", "m4".
            numbered_arg = f"{form_arg}{index}"
            if numbered_arg not in t_node.template_parameters:
                break
            extract_navdêr_template_form(
                wxr, word_entry, t_node, numbered_arg, tag
            )

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for i_tag in expanded_node.find_html_recursively("i"):
        i_text = clean_node(wxr, None, i_tag)
        if i_text.startswith("(") and i_text.endswith(")"):
            word_entry.forms.append(
                Form(form=i_text.strip("() "), tags=["romanization"])
            )
    clean_node(wxr, word_entry, expanded_node)

296 

297 

def extract_navdêr_template_form(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    arg_name: str,
    tag: str | list[str],
) -> None:
    """Append the form stored in template argument ``arg_name``, if any.

    Nothing happens when the argument is absent or cleans to an empty
    string.  ``tag`` may be a single tag or a list of tags for the form.
    """
    params = t_node.template_parameters
    if arg_name not in params:
        return

    form = Form(form=clean_node(wxr, None, params[arg_name]))
    if isinstance(tag, list):
        form.tags.extend(tag)
    elif isinstance(tag, str):
        form.tags.append(tag)

    if form.form == "":
        return
    word_entry.forms.append(form)

316 

317 

def extract_lêker_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract tags and forms from a "lêker" header template.

    The "c" argument is split on "-" and every recognised piece adds an
    entry tag; the "nd", "niha", "borî" and "subj" arguments are extracted
    as forms.
    """
    # https://ku.wiktionary.org/wiki/Şablon:lêker
    category_tags = {
        "gh": "transitive",
        "ngh": "intransitive",
        "x": "proper-noun",
        "p": "compound",
        "h": "compound",
        "b": "idiomatic",
    }
    c_value = clean_node(wxr, None, t_node.template_parameters.get("c", ""))
    word_entry.tags.extend(
        category_tags[code]
        for code in c_value.split("-")
        if code in category_tags
    )

    form_tags = (
        ("nd", "noun-from-verb"),
        ("niha", "present"),
        ("borî", "past"),
        ("subj", "subjunctive"),
    )
    for form_arg, tag in form_tags:
        extract_lêker_template_form(wxr, word_entry, t_node, form_arg, tag)

342 

343 

def extract_lêker_template_form(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    arg_name: str,
    tag: str,
) -> None:
    """Append the verb form stored in ``arg_name`` and its "2" variant.

    The romanization is read from the argument named ``arg_name`` + "tr".
    The "2" variant is only looked up when ``arg_name`` itself is present,
    is not "nd" and does not already end in "2".
    """
    candidates = [arg_name]
    if arg_name != "nd" and not arg_name.endswith("2"):
        candidates.append(arg_name + "2")

    params = t_node.template_parameters
    for name in candidates:
        # A missing argument ends the lookup; the variant is never tried
        # without the base argument.
        if name not in params:
            return
        form = Form(
            form=clean_node(wxr, None, params[name]),
            tags=[tag],
            roman=clean_node(wxr, None, params.get(name + "tr", "")),
        )
        if form.form != "":
            word_entry.forms.append(form)

366 

367 

def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the word(s) a form-of / alt-of template points to on ``sense``.

    The template name decides which arguments hold the target word.
    "binêre" / "bnr" produce ``alt_of`` entries from every non-empty
    argument 1-4; all other templates produce a ``form_of`` entry from the
    first non-empty candidate argument.  "rehê dema niha" is delegated to
    its own extractor.
    """
    # Şablon:formeke peyvê
    name = t_node.template_name
    is_inflection = name in ("formeke peyvê", "inflection of")
    is_alt_of = False
    first_match_only = True

    if is_inflection:
        form_args = ["cude", 3, 2]
    elif name in (
        "dem2",
        "guherto",
        "guharto",
        "rastnivîs",
        "şaşnivîs",
        "şaşî",
        "kevnbûyî",
        "binêre2",
        "bnr2",
        "awayekî din",
        "ad",
        "komparatîv",
        "kom",
        "sûperlatîv",
        "sûp",
        "dema-bê",
        "dema-fireh",
        "raboriya-sade",
    ):
        form_args = [2]
    elif name.endswith(FORM_OF_TEMPLATE_SUFFIXES):
        form_args = [1]
    elif name == "dem":
        form_args = [3]
    elif name == "rehê dema niha":
        extract_rehê_dema_niha_template(wxr, sense, t_node)
        return
    elif name in ("binêre", "bnr"):
        form_args = [1, 2, 3, 4]
        is_alt_of = True
        first_match_only = False
    else:
        form_args = []

    params = t_node.template_parameters
    for arg in form_args:
        word = clean_node(wxr, None, params.get(arg, ""))
        if word == "":
            continue

        if is_alt_of:
            sense.alt_of.append(AltForm(word=word))
            if "alt-of" not in sense.tags:
                sense.tags.append("alt-of")
        else:
            sense.form_of.append(AltForm(word=word))
            if "form-of" not in sense.tags:
                sense.tags.append("form-of")

        if is_inflection:
            # Positional arguments from 4 onwards hold grammatical labels;
            # stop at the first missing position.
            for tag_arg in count(4):
                if tag_arg not in params:
                    break
                raw_tag = clean_node(wxr, None, params[tag_arg]).capitalize()
                if raw_tag not in TAGS:
                    continue
                translated = TAGS[raw_tag]
                if isinstance(translated, str):
                    sense.tags.append(translated)
                elif isinstance(translated, list):
                    sense.tags.extend(translated)

        if first_match_only:
            break

440 

441 

def extract_rehê_dema_niha_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the bold words of the expanded template to ``sense.form_of``."""
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    for bold_node in expanded_node.find_child(NodeKind.BOLD):
        word = clean_node(wxr, None, bold_node)
        if word == "":
            continue
        sense.form_of.append(AltForm(word=word))
        # NOTE(review): the nesting of this tag check is an assumption —
        # the flattened listing does not show whether it belongs inside the
        # non-empty-word branch; confirm against the original file.
        if "form-of" not in sense.tags:
            sense.tags.append("form-of")

454 

455 

def extract_binêre_el_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Mark the entry's latest sense as an obsolete alt-of form.

    The target word is the template's first argument; nothing happens when
    it is empty.  If the entry has no sense yet, a new "no-gloss" sense is
    created and appended.
    """
    target = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if target == "":
        return

    needs_new_sense = len(word_entry.senses) == 0
    if needs_new_sense:
        sense = Sense(tags=["no-gloss"])
    else:
        sense = word_entry.senses[-1]

    sense.alt_of.append(AltForm(word=target))
    sense.tags.extend(["alt-of", "obsolete"])
    if needs_new_sense:
        word_entry.senses.append(sense)