Coverage for src/wiktextract/extractor/tr/pos.py: 89%

162 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2 

3from wikitextprocessor import ( 

4 HTMLNode, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .example import extract_example_list_item 

14from .models import AltForm, Example, Form, Sense, WordEntry 

15from .section_titles import POS_DATA 

16from .tags import translate_raw_tags 

17 

18 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Start a new word entry for a part-of-speech section and fill it.

    A deep copy of ``base_data`` is appended to ``page_data`` and given the
    POS information looked up in ``POS_DATA`` under ``pos_title``.  Gloss
    list items found among the direct children of ``level_node`` are turned
    into senses; the children in front of the first list that contained a
    gloss item are handed to ``extract_pos_header_nodes``.

    Raises ``KeyError`` if ``pos_title`` is not a key of ``POS_DATA``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    children = level_node.children
    # Position of the first list that produced a gloss; defaults to "no such
    # list", in which case every child counts as a header node.
    header_end = len(children)
    for position, child in enumerate(children):
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            is_gloss = child.sarg == "#"
            if not is_gloss and child.sarg == ":" and len(item.children) > 0:
                # ":" items only count as glosses when their leading text
                # carries a "[1]"-style number.
                lead = item.children[0]
                is_gloss = (
                    isinstance(lead, str)
                    and re.search(r"\[\d+\]", lead) is not None
                )
            if is_gloss:
                extract_gloss_list_item(wxr, entry, item)
                header_end = min(header_end, position)

    extract_pos_header_nodes(wxr, entry, children[:header_end])
    translate_raw_tags(entry)

50 

51 

# Names of templates that extract_gloss_list_item() hands over to
# extract_form_of_template() instead of treating them as plain gloss text.
# See:
# https://tr.wiktionary.org/wiki/Kategori:Çekim_şablonları
# https://tr.wiktionary.org/wiki/Kategori:Tanım_şablonları
FORM_OF_TEMPLATES = {
    "çekim",
    "karşılaştırma",
    "Komp.",
    "artıklık",
    "üstünlük",
    "Sup.",
    "tr-çekim",
    "tr-çekim:m1",
    "tr-ünlü-çekimi",
    "ad-hâl",
    "hâl",
    "çoğul ad",
    "çoğulu",
    "çoğul isim",
    "ota-çekim",
    "ikil ad",
    "ikil",
    "çoğul kısaltma",
    "el-ortaç çekimi",
    "eylem-hâl",
    "fiil",
    "eylem",
    "dişil tekili",
    "dişil çoğulu",
    "eril çoğulu",
    "el-çekim:ος-η-ο",
    "el-çekim:βιώνω",
    "el-çekim:ος-α-ο",
    "el-çekim:θεωρώ",
    "el-çekim:ορίζω",
    # The next four are also listed in ALT_OF_TEMPLATES and mapped to
    # "misspelling" in FORM_OF_TEMPLATE_TAGS.
    "yanlış yazım",
    "doğrusu",
    "Doğrusu",
    "imla hatası",
    "ön ad",
    "sıfat",
    "kısaltma",
    "akronim",
    # "farklı" and "alternatif" are also listed in ALT_OF_TEMPLATES.
    "farklı",
    "alternatif",
    "kısa",
    "mastarı",
    "ar-mastarı",
    "romanizasyon",
}

100 

101 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item into a sense of ``word_entry``.

    Args:
        wxr: Extraction context used for cleaning and expanding nodes.
        word_entry: Entry that receives the new sense.
        list_item: The list item node holding the gloss.
        parent_sense: Sense of the enclosing list item, if any.  The new
            sense starts as a deep copy of it, so nested glosses inherit
            whatever the parent sense already holds.

    ``t``/``terim`` templates contribute raw tags, templates named in
    ``FORM_OF_TEMPLATES`` are handled by ``extract_form_of_template`` and
    every other non-list child becomes gloss text.  Nested ``#`` lists are
    processed recursively as sub-senses; nested lists that start with
    ``#``/``:`` and end with ``:``/``*`` are read as examples of this sense.
    """
    sense = (
        parent_sense.model_copy(deep=True)
        if parent_sense is not None
        else Sense()
    )
    gloss_nodes = []
    for node in list_item.children:
        if isinstance(node, TemplateNode) and node.template_name in [
            "t",
            "terim",
        ]:
            extract_terim_template(wxr, sense, node)
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in FORM_OF_TEMPLATES
        ):
            extract_form_of_template(wxr, word_entry, sense, node)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            gloss_nodes.append(node)

    gloss_str = clean_node(wxr, sense, gloss_nodes)
    # Drop a leading "[1]"-style sense number.
    gloss_str = re.sub(r"^\[\d+\]\s*", "", gloss_str)
    if gloss_str != "":
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        # extract_form_of_template() may already have appended this very
        # object; compare by identity so the sense is not listed twice.
        if not any(added is sense for added in word_entry.senses):
            word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)
        elif child_list.sarg.startswith(
            ("#", ":")
        ) and child_list.sarg.endswith((":", "*")):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                example = Example(text="")
                extract_example_list_item(
                    wxr, word_entry, child_list_item, example
                )
                if example.text != "":
                    sense.examples.append(example)

149 

150 

def extract_terim_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the comma-separated labels of a ``terim`` template to ``sense``.

    The cleaned template text is stripped of surrounding parentheses and
    spaces, split on commas, and each non-empty piece (other than a lone
    apostrophe) is appended to ``sense.raw_tags``.
    """
    # https://tr.wiktionary.org/wiki/Şablon:terim
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    sense.raw_tags.extend(
        label
        for label in map(str.strip, label_text.split(","))
        if label not in ("", "'")
    )

160 

161 

def extract_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Dispatch the header-line templates that precede the gloss list.

    Templates named ``başlık başı`` or prefixed with ``<lang_code>-`` go to
    ``extract_pos_header_template``; the possessive/plural templates go to
    ``extract_sahiplik_template``.  Everything else is ignored.
    """
    possessive_templates = ("sahiplik", "sahiplik eki", "özel çoğul")
    for node in nodes:
        if not isinstance(node, TemplateNode):
            continue
        name = node.template_name
        if (
            name.startswith((word_entry.lang_code + "-"))
            or name == "başlık başı"
        ):
            extract_pos_header_template(wxr, word_entry, node)
        elif name in possessive_templates:
            extract_sahiplik_template(wxr, word_entry, node)

177 

178 

def extract_pos_header_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms and gender tags from an expanded headword template.

    The template is expanded and its top-level nodes are scanned in order:

    * italic text is a pending raw tag for the next bold form; the word
      "veya" is not stored as a tag but is remembered as a marker
    * ``<b>`` holds an inflected form, which takes the pending raw tags and
      is appended to ``word_entry.forms``
    * ``<span>`` with "gender" in its class holds ``<abbr>`` gender labels,
      appended to ``word_entry.raw_tags``
    * ``<strong>`` with "headword" in its class is stored as the
      ``canonical`` form when it differs from the page title
    * ``<span>`` with "headword-tr" in its class is stored as a
      ``transliteration`` form

    Args:
        wxr: Extraction context used for expanding and cleaning nodes.
        word_entry: Entry whose ``forms`` and ``raw_tags`` are extended.
        t_node: The headword template node.
    """
    # Şablon:başlık_başı, Şablon:tr-ad
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    raw_tags = []
    last_italic_is_or = False
    for node in expanded_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            raw_tag = clean_node(wxr, None, node)
            if raw_tag not in ["", "veya"]:
                raw_tags.append(raw_tag)
            last_italic_is_or = raw_tag == "veya"
        elif isinstance(node, HTMLNode) and node.tag == "b":
            word = clean_node(wxr, None, node)
            if word != "":
                form = Form(form=word, raw_tags=raw_tags)
                # After "veya" the form reuses the tags of the form added
                # last.  Guard against an empty list so a header that starts
                # with "veya" cannot raise IndexError.
                if last_italic_is_or and len(word_entry.forms) > 0:
                    form.raw_tags.extend(word_entry.forms[-1].raw_tags)
                    form.tags.extend(word_entry.forms[-1].tags)
                translate_raw_tags(form)
                word_entry.forms.append(form)
                raw_tags.clear()
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "gender" in node.attrs.get("class", "")
        ):
            for abbr_tag in node.find_html("abbr"):
                gender_raw_tag = clean_node(wxr, None, abbr_tag)
                if gender_raw_tag not in ["", "?"]:
                    word_entry.raw_tags.append(gender_raw_tag)
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "strong"
            and "headword" in node.attrs.get("class", "")
        ):
            form_str = clean_node(wxr, None, node)
            if form_str not in ["", wxr.wtp.title]:
                word_entry.forms.append(Form(form=form_str, tags=["canonical"]))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "headword-tr" in node.attrs.get("class", "")
        ):
            roman = clean_node(wxr, None, node)
            if roman != "":
                word_entry.forms.append(
                    Form(form=roman, tags=["transliteration"])
                )

    # Result is discarded; presumably run so clean_node can record data from
    # the expansion (e.g. categories) on word_entry - TODO confirm.
    clean_node(wxr, word_entry, expanded_node)

233 

234 

# https://tr.wiktionary.org/wiki/Kategori:Tanım_şablonları
# Templates whose target word is read from the first bold node of the
# expansion, mapped to the tag added to the sense.
BOLD_FORM_OF_TEMPLATE_TAGS = {
    "akronim": "acronym",
    "kısaltma": "abbreviation",
    "kısa": "short-form",
    **dict.fromkeys(("mastarı", "ar-mastarı"), "noun-from-verb"),
}
# Other form-of templates that add a tag to the sense.
FORM_OF_TEMPLATE_TAGS = {
    "romanizasyon": "romanization",
    **dict.fromkeys(
        ("yanlış yazım", "doğrusu", "Doğrusu", "imla hatası"),
        "misspelling",
    ),
}

# Templates recorded as "alt-of" rather than "form-of".
ALT_OF_TEMPLATES = {
    "farklı",
    "alternatif",
    "yanlış yazım",
    "doğrusu",
    "Doğrusu",
    "imla hatası",
}

259 

260 

def extract_form_of_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Record the target word and gloss of a form-of / alt-of template.

    Args:
        wxr: Extraction context used for expanding and cleaning nodes.
        word_entry: Entry that receives ``sense`` once it has a gloss.
        sense: Sense being built for the surrounding list item; mutated.
        t_node: A template whose name is in ``FORM_OF_TEMPLATES``.

    The target word is the first bold node for templates in
    ``BOLD_FORM_OF_TEMPLATE_TAGS``; for the others it is the first ``<i>``
    element, falling back to the first link.  The word is stored in
    ``sense.alt_of`` for templates in ``ALT_OF_TEMPLATES`` and in
    ``sense.form_of`` otherwise.

    If the expansion has a list as a direct child, the text in front of the
    first such list is the gloss and the list items become sub-senses.
    Otherwise the whole expansion text is the gloss.
    """
    # https://tr.wiktionary.org/wiki/Şablon:çekim
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    word = ""
    if t_node.template_name in BOLD_FORM_OF_TEMPLATE_TAGS:
        sense.tags.append(BOLD_FORM_OF_TEMPLATE_TAGS[t_node.template_name])
        for bold_node in expanded_node.find_child(NodeKind.BOLD):
            word = clean_node(wxr, None, bold_node)
            break
    else:
        if t_node.template_name in FORM_OF_TEMPLATE_TAGS:
            sense.tags.append(FORM_OF_TEMPLATE_TAGS[t_node.template_name])
        for i_tag in expanded_node.find_html_recursively("i"):
            word = clean_node(wxr, None, i_tag)
            break
        if word == "":
            for link_node in expanded_node.find_child_recursively(
                NodeKind.LINK
            ):
                word = clean_node(wxr, None, link_node)
                break

    if word != "" and t_node.template_name in ALT_OF_TEMPLATES:
        sense.tags.append("alt-of")
        sense.alt_of.append(AltForm(word=word))
    elif word != "":
        sense.tags.append("form-of")
        sense.form_of.append(AltForm(word=word))

    # Result is discarded; presumably run so clean_node can record data from
    # the expansion (e.g. categories) on the sense - TODO confirm.
    clean_node(wxr, sense, expanded_node)
    # Only the first direct-child list is used.  The ``else`` clause runs
    # when there is none, so an expansion whose only list is nested deeper
    # still yields a gloss instead of being dropped.
    for index, list_node in expanded_node.find_child(
        NodeKind.LIST, with_index=True
    ):
        gloss = clean_node(wxr, None, expanded_node.children[:index])
        if gloss != "":
            sense.glosses.append(gloss)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, word_entry, list_item, sense)
        break
    else:
        gloss = clean_node(wxr, None, expanded_node)
        if gloss != "":
            sense.glosses.append(gloss)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)

316 

317 

def extract_sahiplik_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Add the single form produced by a possessive/plural header template.

    Italic top-level nodes of the expansion supply raw tags and bold nodes
    supply the form text.  For ``sahiplik``/``sahiplik eki`` the text is the
    first argument of a link inside the bold node; for other templates it is
    the bold node itself.  Nothing is added if no form text was found.
    """
    # https://tr.wiktionary.org/wiki/Şablon:sahiplik, Şablon:özel_çoğul
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    reads_link_target = t_node.template_name in ("sahiplik", "sahiplik eki")
    form = Form(form="")
    for child in expanded_node.children:
        if not isinstance(child, WikiNode):
            continue
        if child.kind == NodeKind.ITALIC:
            label = clean_node(wxr, None, child)
            if label != "":
                form.raw_tags.append(label)
        elif child.kind == NodeKind.BOLD:
            if not reads_link_target:
                form.form = clean_node(wxr, None, child)
                continue
            for link in child.find_child(NodeKind.LINK):
                if len(link.largs) > 0:
                    form.form = clean_node(wxr, None, link.largs[0])

    if form.form == "":
        return
    translate_raw_tags(form)
    word_entry.forms.append(form)

341 

342 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Append each non-empty list item of a note section to the entry's notes.

    Only list items of lists that are direct children of ``level_node`` are
    read; each item's cleaned text becomes one element of
    ``word_entry.notes``.
    """
    note_items = (
        item
        for note_list in level_node.find_child(NodeKind.LIST)
        for item in note_list.find_child(NodeKind.LIST_ITEM)
    )
    for item in note_items:
        text = clean_node(wxr, None, item.children)
        if text == "":
            continue
        word_entry.notes.append(text)