Coverage for src/wiktextract/extractor/tr/pos.py: 89%

1import re

3from wikitextprocessor import (

4 HTMLNode,

5 LevelNode,

6 NodeKind,

7 TemplateNode,

8 WikiNode,

11from ...page import clean_node

12from ...wxr_context import WiktextractContext

13from .example import extract_example_list_item

14from .models import AltForm, Example, Form, Sense, WordEntry

15from .section_titles import POS_DATA

16from .tags import translate_raw_tags

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new word entry for one part-of-speech section and fill it.

    A deep copy of ``base_data`` is appended to ``page_data`` and then
    populated from ``level_node``: gloss list items become senses, and every
    child node located before the first gloss list is handed to
    ``extract_pos_header_nodes``.

    Args:
        wxr: Extraction context passed through to the helper functions.
        page_data: Entries collected so far for the page; the new entry is
            appended to this list.
        base_data: Entry whose deep copy is the starting point of the new one.
        level_node: Section node of the part-of-speech heading.
        pos_title: Section title; it must be a key of ``POS_DATA``.

    Raises:
        KeyError: If ``pos_title`` is not present in ``POS_DATA``.
    """
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    # Index of the first list that yielded a gloss; defaults to "no gloss list
    # found", in which case all children are treated as header nodes.
    gloss_list_index = len(level_node.children)
    for index, node in enumerate(level_node.children):
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                # A gloss is either an item of a "#" list, or an item of a ":"
                # list whose leading text contains a "[number]" marker.
                if node.sarg == "#" or (
                    node.sarg == ":"
                    and len(list_item.children) > 0
                    and isinstance(list_item.children[0], str)
                    and re.search(r"\[\d+\]", list_item.children[0]) is not None
                ):
                    extract_gloss_list_item(wxr, page_data[-1], list_item)
                    if index < gloss_list_index:
                        gloss_list_index = index

    extract_pos_header_nodes(
        wxr, page_data[-1], level_node.children[:gloss_list_index]
    )
    translate_raw_tags(page_data[-1])

# Names of gloss templates that extract_gloss_list_item() passes to
# extract_form_of_template() instead of treating them as plain gloss text.
# https://tr.wiktionary.org/wiki/Kategori:Çekim_şablonları
# https://tr.wiktionary.org/wiki/Kategori:Tanım_şablonları
FORM_OF_TEMPLATES = {
    "çekim",
    "karşılaştırma",
    "Komp.",
    "artıklık",
    "üstünlük",
    "Sup.",
    "tr-çekim",
    "tr-çekim:m1",
    "tr-ünlü-çekimi",
    "ad-hâl",
    "hâl",
    "çoğul ad",
    "çoğulu",
    "çoğul isim",
    "ota-çekim",
    "ikil ad",
    "ikil",
    "çoğul kısaltma",
    "el-ortaç çekimi",
    "eylem-hâl",
    "fiil",
    "eylem",
    "dişil tekili",
    "dişil çoğulu",
    "eril çoğulu",
    "el-çekim:ος-η-ο",
    "el-çekim:βιώνω",
    "el-çekim:ος-α-ο",
    "el-çekim:θεωρώ",
    "el-çekim:ορίζω",
    "yanlış yazım",
    "doğrusu",
    "Doğrusu",
    "imla hatası",
    "ön ad",
    "sıfat",
    "kısaltma",
    "akronim",
    "farklı",
    "alternatif",
    "kısa",
    "mastarı",
    "ar-mastarı",
    "romanizasyon",
}

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item into a sense, recursing into nested lists.

    Args:
        wxr: Extraction context passed through to the helper functions.
        word_entry: Entry that receives the extracted senses.
        list_item: List item node holding the gloss.
        parent_sense: Sense of the enclosing list item, if any. The new sense
            starts as a deep copy of it; otherwise it starts empty.
    """
    sense = (
        parent_sense.model_copy(deep=True)
        if parent_sense is not None
        else Sense()
    )
    gloss_nodes = []
    for node in list_item.children:
        if isinstance(node, TemplateNode) and node.template_name in [
            "t",
            "terim",
        ]:
            extract_terim_template(wxr, sense, node)
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in FORM_OF_TEMPLATES
        ):
            extract_form_of_template(wxr, word_entry, sense, node)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            # Nested lists are handled separately below; everything else is
            # part of the gloss text.
            gloss_nodes.append(node)

    gloss_str = clean_node(wxr, sense, gloss_nodes)
    # Drop a leading "[number]" marker from the gloss text.
    gloss_str = re.sub(r"^\[\d+\]\s*", "", gloss_str)
    if gloss_str != "":
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
            # List marker made of "#" at both ends: nested glosses.
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)
        elif child_list.sarg.startswith(
            ("#", ":")
        ) and child_list.sarg.endswith((":", "*")):
            # List marker ending in ":" or "*": usage examples of this sense.
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                example = Example(text="")
                extract_example_list_item(
                    wxr, word_entry, child_list_item, example
                )
                if example.text != "":
                    sense.examples.append(example)

def extract_terim_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the comma-separated labels of a "t"/"terim" template as raw tags.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        sense: Sense whose ``raw_tags`` list is extended.
        t_node: The template node to read.
    """
    # https://tr.wiktionary.org/wiki/Şablon:terim
    # Surrounding parentheses and spaces are stripped before splitting.
    raw_tags_str = clean_node(wxr, sense, t_node).strip("() ")
    for raw_tag in raw_tags_str.split(","):
        raw_tag = raw_tag.strip()
        # Skip empty pieces and a stray apostrophe.
        if raw_tag not in ["", "'"]:
            sense.raw_tags.append(raw_tag)

def extract_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Dispatch the templates of a headword line to their extractors.

    Templates named "başlık başı" or prefixed with ``"<lang_code>-"`` go to
    ``extract_pos_header_template``; "sahiplik", "sahiplik eki" and
    "özel çoğul" go to ``extract_sahiplik_template``. All other nodes are
    ignored.

    Args:
        wxr: Extraction context passed through to the extractors.
        word_entry: Entry that receives the extracted data; its ``lang_code``
            selects the language-specific template prefix.
        nodes: Nodes to scan.
    """
    for node in nodes:
        if isinstance(node, TemplateNode) and (
            node.template_name.startswith((word_entry.lang_code + "-"))
            or node.template_name == "başlık başı"
        ):
            extract_pos_header_template(wxr, word_entry, node)
        elif isinstance(node, TemplateNode) and node.template_name in [
            "sahiplik",
            "sahiplik eki",
            "özel çoğul",
        ]:
            extract_sahiplik_template(wxr, word_entry, node)

def extract_pos_header_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms and gender tags from an expanded headword template.

    The template is expanded and its top-level nodes are scanned in order:

    * italic text is collected as raw tags for the next ``<b>`` form
      (the word "veya" is not a tag, it only marks an alternative);
    * ``<b>`` text becomes a ``Form`` carrying the collected raw tags;
    * ``<abbr>`` texts inside a ``<span>`` whose class contains "gender" are
      appended to ``word_entry.raw_tags``;
    * ``<strong>`` text whose class contains "headword" becomes a "canonical"
      form when it differs from the page title;
    * ``<span>`` text whose class contains "headword-tr" becomes a
      "transliteration" form.

    Args:
        wxr: Extraction context used for expansion and text cleaning.
        word_entry: Entry that receives the forms and raw tags.
        t_node: The headword template node.
    """
    # Şablon:başlık_başı, Şablon:tr-ad
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    raw_tags = []
    last_italic_is_or = False
    for node in expanded_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            raw_tag = clean_node(wxr, None, node)
            if raw_tag not in ["", "veya"]:
                raw_tags.append(raw_tag)
            last_italic_is_or = raw_tag == "veya"
        elif isinstance(node, HTMLNode) and node.tag == "b":
            word = clean_node(wxr, None, node)
            if word != "":
                form = Form(form=word, raw_tags=raw_tags)
                # A form introduced by "veya" is an alternative of the form
                # recorded just before it, so it inherits that form's tags.
                # The emptiness check avoids an IndexError when "veya" shows
                # up before any form has been recorded.
                if last_italic_is_or and len(word_entry.forms) > 0:
                    form.raw_tags.extend(word_entry.forms[-1].raw_tags)
                    form.tags.extend(word_entry.forms[-1].tags)
                translate_raw_tags(form)
                word_entry.forms.append(form)
                raw_tags.clear()
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "gender" in node.attrs.get("class", "")
        ):
            for abbr_tag in node.find_html("abbr"):
                gender_raw_tag = clean_node(wxr, None, abbr_tag)
                if gender_raw_tag not in ["", "?"]:
                    word_entry.raw_tags.append(gender_raw_tag)
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "strong"
            and "headword" in node.attrs.get("class", "")
        ):
            form_str = clean_node(wxr, None, node)
            if form_str not in ["", wxr.wtp.title]:
                word_entry.forms.append(Form(form=form_str, tags=["canonical"]))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "headword-tr" in node.attrs.get("class", "")
        ):
            roman = clean_node(wxr, None, node)
            if roman != "":
                word_entry.forms.append(
                    Form(form=roman, tags=["transliteration"])
                )

    # The returned text is discarded; the call is made with word_entry as the
    # target — presumably so clean_node records page metadata (e.g.
    # categories) on the entry; confirm against clean_node.
    clean_node(wxr, word_entry, expanded_node)

# https://tr.wiktionary.org/wiki/Kategori:Tanım_şablonları
# Templates whose target word is read from the first bold node of the
# expansion, mapped to the tag added to the sense.
BOLD_FORM_OF_TEMPLATE_TAGS = {
    "akronim": "acronym",
    "kısaltma": "abbreviation",
    "kısa": "short-form",
    "mastarı": "noun-from-verb",
    "ar-mastarı": "noun-from-verb",
}
# Other form-of templates that add an extra tag to the sense.
FORM_OF_TEMPLATE_TAGS = {
    "romanizasyon": "romanization",
    "yanlış yazım": "misspelling",
    "doğrusu": "misspelling",
    "Doğrusu": "misspelling",
    "imla hatası": "misspelling",
}

# Templates whose target word is stored in `alt_of` (tag "alt-of") rather
# than in `form_of`.
ALT_OF_TEMPLATES = {
    "farklı",
    "alternatif",
    "yanlış yazım",
    "doğrusu",
    "Doğrusu",
    "imla hatası",
}

def extract_form_of_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Extract the target word and gloss of a form-of/alt-of template.

    The template is expanded, the word it refers to is located, and the sense
    is tagged "alt-of" or "form-of" accordingly. The expanded text then
    becomes a gloss of ``sense``, which is appended to ``word_entry.senses``.
    If the expansion contains a list, only the nodes before the first
    top-level list form the gloss and the items of that list are processed as
    child glosses of ``sense``.

    Args:
        wxr: Extraction context used for expansion and text cleaning.
        word_entry: Entry that receives the sense.
        sense: Sense being built for the enclosing list item; modified in
            place.
        t_node: The form-of template node.
    """
    # https://tr.wiktionary.org/wiki/Şablon:çekim
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    word = ""
    if t_node.template_name in BOLD_FORM_OF_TEMPLATE_TAGS:
        sense.tags.append(BOLD_FORM_OF_TEMPLATE_TAGS[t_node.template_name])
        # Target word: first top-level bold node.
        for bold_node in expanded_node.find_child(NodeKind.BOLD):
            word = clean_node(wxr, None, bold_node)
            break
    else:
        if t_node.template_name in FORM_OF_TEMPLATE_TAGS:
            sense.tags.append(FORM_OF_TEMPLATE_TAGS[t_node.template_name])
        # Target word: first <i> element anywhere in the expansion, falling
        # back to the first link when no <i> text was found.
        for i_tag in expanded_node.find_html_recursively("i"):
            word = clean_node(wxr, None, i_tag)
            break
        if word == "":
            for link_node in expanded_node.find_child_recursively(
                NodeKind.LINK
            ):
                word = clean_node(wxr, None, link_node)
                break

    if word != "" and t_node.template_name in ALT_OF_TEMPLATES:
        sense.tags.append("alt-of")
        sense.alt_of.append(AltForm(word=word))
    elif word != "":
        sense.tags.append("form-of")
        sense.form_of.append(AltForm(word=word))

    # The returned text is discarded; the call is made with `sense` as the
    # target — presumably so clean_node records metadata (e.g. categories)
    # on the sense; confirm against clean_node.
    clean_node(wxr, sense, expanded_node)
    if expanded_node.contain_node(NodeKind.LIST):
        for index, list_node in expanded_node.find_child(
            NodeKind.LIST, with_index=True
        ):
            # Everything before the first top-level list is the gloss text.
            gloss = clean_node(wxr, None, expanded_node.children[:index])
            if gloss != "":
                sense.glosses.append(gloss)
                translate_raw_tags(sense)
                word_entry.senses.append(sense)
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, list_item, sense)
            # Only the first top-level list is processed.
            break
    else:
        gloss = clean_node(wxr, None, expanded_node)
        if gloss != "":
            sense.glosses.append(gloss)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)

def extract_sahiplik_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract a single form from a "sahiplik"/"özel çoğul" template.

    Italic nodes of the expansion supply the raw tags of the form and the
    bold node supplies the form text. For "sahiplik" and "sahiplik eki" the
    text is taken from the first argument of a link inside the bold node;
    for other templates the whole bold node is used. The form is appended to
    ``word_entry.forms`` only when its text is not empty.

    Args:
        wxr: Extraction context used for expansion and text cleaning.
        word_entry: Entry that receives the form.
        t_node: The template node.
    """
    # https://tr.wiktionary.org/wiki/Şablon:sahiplik, Şablon:özel_çoğul
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    form = Form(form="")
    for node in expanded_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            raw_tag = clean_node(wxr, None, node)
            if raw_tag != "":
                form.raw_tags.append(raw_tag)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
            if t_node.template_name in ["sahiplik", "sahiplik eki"]:
                for link_node in node.find_child(NodeKind.LINK):
                    if len(link_node.largs) > 0:
                        form.form = clean_node(wxr, None, link_node.largs[0])
            else:
                form.form = clean_node(wxr, None, node)
    if form.form != "":
        translate_raw_tags(form)
        word_entry.forms.append(form)

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect the list items of a note section as plain-text notes.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        word_entry: Entry whose ``notes`` list is extended.
        level_node: Section node of the note heading; only lists that are its
            direct children are read.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            note = clean_node(wxr, None, list_item.children)
            if note != "":
                word_entry.notes.append(note)

Coverage for src/wiktextract/extractor/tr/pos.py: 89%

162 statements