Coverage for src/wiktextract/extractor/simple/pos.py: 80%

195 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from wikitextprocessor import NodeKind, TemplateArgs, TemplateNode, WikiNode 

2from wikitextprocessor.parser import LEVEL_KIND_FLAGS 

3 

4from wiktextract import WiktextractContext 

5from wiktextract.page import clean_node 

6 

7from .models import Example, Form, Linkage, Sense, TemplateData, WordEntry 

8from .section_titles import POS_HEADINGS 

9from .table import parse_pos_table 

10from .tags_utils import convert_tags_in_sense 

11from .text_utils import ( 

12 POS_ENDING_NUMBER_RE, 

13 POS_TEMPLATE_NAMES, 

14 STRIP_PUNCTUATION, 

15) 

16 

17# from wiktextract.wxr_logging import logger 

18 

19 

def remove_duplicate_forms(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Return `forms` with exact duplicates removed.

    Two forms are duplicates when their `form`, `tags` and `raw_tags`
    fields are all equal. Of each duplicate group the *last* occurrence
    is kept (each form is dropped if an identical one appears later).
    If nothing was removed, the original list object is returned.
    """
    if not forms:
        return []
    deduplicated = [
        candidate
        for idx, candidate in enumerate(forms)
        # Keep the form only when no identical form follows it.
        if not any(
            candidate.form == later.form
            and candidate.tags == later.tags
            and candidate.raw_tags == later.raw_tags
            for later in forms[idx + 1 :]
        )
    ]
    if len(deduplicated) < len(forms):
        # wxr.wtp.debug("Found duplicate forms", sortid="simple/pos/32")
        return deduplicated
    return forms

44 

45 

# A gloss-list item resolves to either a proper Sense or, for
# example/quotation lines, an Example; recurse_glosses1 returns a mix.
ExOrSense = Sense | Example

# Gloss-line templates whose expansion is suppressed entirely in
# gloss_template_fn; "exstub" is also special-cased in parse_gloss,
# where it marks the sense as "no-gloss".
IGNORED_GLOSS_TEMPLATES = ("exstub",)

49 

50 

def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Take what is preferably a line of text and extract tags and a gloss from
    it. The data is inserted into parent_sense, and for recursion purposes
    we return a boolean that tells whether there was any gloss text in a
    lower node.

    Side effects on `parent_sense`: may set `synonyms`, `antonyms`, extend
    `raw_tags` (leading bracketed templates become raw tags) and append to
    `glosses`.
    """
    if len(contents) == 0:
        return False

    # Tags collected from leading templates like "(countable)".
    template_tags: list[str] = []
    found_template = False
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []

    # We define this subfunction here to use closure with synonyms and antonyms;
    # this is the usual way we do it with these kinds of _fn's in the main
    # extractor. You could also make a wrapper function that takes the
    # variables you want to enclose and returns a _fn function with those
    # enclosed, although I don't know if that is more or less efficient;
    # if you need to use the same _fn code in two places, this is the
    # way to go.
    # There's a more detailed explanation about using template_fn in
    # pronunciation.py.
    def gloss_template_fn(name: str, ht: TemplateArgs) -> str | None:
        # Intercept synonym/antonym templates: harvest their arguments into
        # Linkage objects and remove them from the rendered gloss text.
        if name in ("synonyms", "synonym", "syn"):
            for syn in ht.values():  # ht for 'hashtable'. Tatu comes from C.
                # The template parameters of `synonyms` is simple: just a list.
                if not syn:
                    continue
                synonyms.append(
                    Linkage(
                        word=clean_node(
                            wxr, parent_sense, clean_node(wxr, None, syn)
                        )
                    )
                )
            # Returning a string means replacing the 'expansion' that would
            # have otherwise appeared there with it; `None` leaves things alone.
            return ""
        if name in ("antonyms", "antonym", "ant"):
            for ant in ht.values():
                if not ant:
                    continue
                antonyms.append(
                    Linkage(
                        word=clean_node(
                            wxr, parent_sense, clean_node(wxr, None, ant)
                        )
                    )
                )
            return ""

        if name in IGNORED_GLOSS_TEMPLATES:
            return ""

        # Don't handle other templates here.
        return None

    # Scan leading template nodes for tags; stop at the first real content.
    # After the loop, `i` is the index of the first non-tag element.
    for i, tnode in enumerate(contents):
        if (
            isinstance(tnode, str)
            and tnode.strip(STRIP_PUNCTUATION)
            or not isinstance(tnode, (TemplateNode, str))
        ):
            # When we encounter the first naked string that isn't just
            # whitespace or the first WikiNode that isn't a template.
            break
        if isinstance(tnode, TemplateNode):
            if tnode.template_name == "exstub":
                # Stub marker: flag the sense and bail out; no gloss text.
                parent_sense.raw_tags.append("no-gloss")
                return False
            tag_text = clean_node(
                wxr, parent_sense, tnode, template_fn=gloss_template_fn
            )
            if tag_text.endswith((")", "]")):
                # Simple wiktionary is pretty good at making these templates
                # have brackets
                tag_text = tag_text.strip(STRIP_PUNCTUATION)
                if tag_text:
                    found_template = True
                    template_tags.append(tag_text)
            else:
                # looks like normal text, so probably something {{plural of}}.
                break
    # else for the for loop: if we never break
    else:
        # If we never break, that means the last item was a tag.
        # Bump `i` past it so the slice below excludes the tag templates.
        i += 1

    if found_template is True:
        # Drop the leading tag templates; only real gloss content remains.
        contents = contents[i:]

    text = clean_node(
        wxr, parent_sense, contents, template_fn=gloss_template_fn
    )

    if len(synonyms) > 0:
        parent_sense.synonyms = synonyms

    if len(antonyms) > 0:
        parent_sense.antonyms = antonyms

    if len(template_tags) > 0:
        parent_sense.raw_tags.extend(template_tags)

    if len(text) > 0:
        parent_sense.glosses.append(text)
        return True

    return False

162 

163 

def recurse_glosses1(
    wxr: WiktextractContext,
    parent_sense: Sense,
    node: WikiNode,
) -> list[ExOrSense]:
    """Helper function for recurse_glosses.

    Walks one LIST or LIST_ITEM node. Returns a list of Sense objects
    (and, for example/quotation list items, Example objects that callers
    are expected to attach to their own sense). `parent_sense` is deep-
    copied for each child so siblings don't share mutations.
    """
    ret: list[ExOrSense] = []
    found_gloss = False

    if node.kind == NodeKind.LIST:
        # A LIST just fans out into its LIST_ITEM children.
        for child in node.children:
            if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM:
                # This should never happen
                wxr.wtp.error(
                    f"{child=} is direct child of NodeKind.LIST",
                    sortid="simple/pos/44",
                )
                continue
            ret.extend(
                recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child)
            )
    elif node.kind == NodeKind.LIST_ITEM:
        # Split the item into its own content and any nested sublists.
        contents = []
        sublists = []
        broke_out = False
        for i, c in enumerate(node.children):
            # The contents ends when a new sublist begins.
            if isinstance(c, WikiNode) and c.kind == NodeKind.LIST:
                broke_out = True
                break
            contents.append(c)
        if broke_out is True:
            # Everything from the first nested LIST onward.
            sublists = node.children[i:]

        # A LIST and LIST_ITEM `sarg` is basically the prefix of the line, like
        # `#` or `##:`: the token that appears at the very start of a line that
        # is used to parse the depth and structure of lists.
        if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"):
            # This is either a quotation or example.
            # The `not in` filters out lines that are usually notes or random
            # stuff not inside gloss lists; see "dare".
            text = clean_node(
                wxr, parent_sense, contents
            )  # clean_node strip()s already so no need to .strip() here.
            example = Example(text=text)
            # logger.debug(f"{wxr.wtp.title}/example\n{text}")
            # We will not bother with subglosses for example entries;
            # XXX do something about it if it becomes relevant.
            return [example]
        elif node.sarg in (":", "*"):
            # Bare ":"/"*" lines are notes/random content, not glosses.
            wxr.wtp.debug(
                f"Gloss item line starts with {node.sarg=}.",
                sortid="simple/pos/214",
            )
            return []

        found_gloss = parse_gloss(wxr, parent_sense, contents)

        # Recurse into sublists: Examples attach to this sense, Senses
        # (subglosses) are collected into `ret`.
        for sl in sublists:
            if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST):
                # Should not happen
                wxr.wtp.error(
                    f"Sublist is not NodeKind.LIST: {sublists=!r}",
                    sortid="simple/pos/82",
                )
                continue
            for r in recurse_glosses1(
                wxr, parent_sense.model_copy(deep=True), sl
            ):
                if isinstance(r, Example):
                    parent_sense.examples.append(r)
                else:
                    ret.append(r)

    if len(ret) > 0:
        # the recursion returned actual senses from below, so we will
        # ignore everything else (incl. any example data that might have
        # been given to parent_sense) and return that instead.
        # XXX if this becomes relevant, add the example data to a returned
        # subsense instead?
        return ret

    if found_gloss is True or "no-gloss" in parent_sense.raw_tags:
        return [parent_sense]

    return []

250 

251 

def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Recurse through WikiNodes to find glosses and sense-related data.

    Examples must have been attached to their senses below this level;
    any Example that reaches here is logged as an error and skipped.
    Tags are normalized on every returned sense.
    """
    senses: list[Sense] = []
    for r in recurse_glosses1(wxr, Sense(), node):
        if isinstance(r, Example):
            # Examples belong inside senses; bubbling up here is a bug.
            wxr.wtp.error(
                f"Example() has bubbled to recurse_glosses: {r.json()}",
                sortid="simple/pos/glosses",
            )
            continue
        convert_tags_in_sense(r)
        senses.append(r)
    return senses

273 

274 

def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    # the "noun" in "Noun 2"
    pos_title: str,
    # the "2" in "Noun 2"
    pos_num: int = -1,
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'. `data` provides basic
    data common with other POS sections, like pronunciation or etymology.
    `data` is mutated in place and returned; a "no-gloss" sense is appended
    when no real gloss content could be extracted."""

    # Metadata for different part-of-speech kinds.
    pos_meta = POS_HEADINGS[pos_title]
    data.pos = pos_meta["pos"]  # the internal/translated name for the POS
    data.pos_num = pos_num  # SEW uses "Noun 1", "Noun 2" style headings.

    # Sound data associated with this POS might be coming from a shared
    # section, in which case we've tried to tag the sound data with its
    # pos name + number if possible. Filter out stuff that doesn't fit.
    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos section(s), so
            # we add it to everything; this is basically the default behavior.
            new_sounds.append(sound)
        else:
            for sound_pos in sound.poses:
                # Split a tag like "noun 2" into its name and number parts.
                m = POS_ENDING_NUMBER_RE.search(sound_pos)
                if m is not None:
                    s_num = int(m.group(1).strip())
                    s_pos = sound_pos[: m.start()].strip().lower()
                else:
                    s_pos = sound_pos.strip().lower()
                    s_num = -1
                # Normalize the name through POS_HEADINGS so it is comparable
                # with data.pos.
                sound_meta = POS_HEADINGS[s_pos]
                s_pos = sound_meta["pos"]
                if s_pos == data.pos and s_num == data.pos_num:
                    new_sounds.append(sound)
    data.sounds = new_sounds

    # Get child nodes except headings (= LEVEL).
    pos_contents = list(node.invert_find_child(LEVEL_KIND_FLAGS))

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # Check POS templates at the start of the section (Simple English specific).
    template_tags: list[str] = []
    template_forms: list[Form] = []
    head_templates: list[TemplateData] = []

    # Typically, a Wiktionary has a word head before glosses, which contains
    # the main form of the word (usually same as the title of the article)
    # and common other forms of the word, plus qualifiers and other data
    # like that; however, for Simple English Wiktionary the format is to
    # have a table (or two, if there's variations) containing the word's
    # conjugation or declension, so we don't have to actually parse the
    # head here.
    # After this loop, `i` is the index of the first node that is NOT a
    # leading POS template (see the for-else below).
    for i, child in enumerate(pos_contents):
        if isinstance(child, str) and not child.strip():
            # Ignore whitespace
            continue
        # TemplateNode is a subclass of WikiNode; not all kinds of nodes have
        # a subclass, but TemplateNode is handy.
        if (
            isinstance(child, TemplateNode)
            and child.template_name in POS_TEMPLATE_NAMES
        ):
            if child.template_name not in pos_meta["templates"]:
                wxr.wtp.debug(
                    f"Template {child.template_name} "
                    f"found under {pos_title}",
                    sortid="simple/pos/93",
                )
            elif ttags := pos_meta["templates"][child.template_name]:
                # Some templates have associated tags:
                # `irrnoun` -> ["irregular"]
                template_tags.extend(ttags)
            if forms := parse_pos_table(wxr, child, data):
                template_forms.extend(forms)
            else:
                wxr.wtp.warning(
                    f"POS template '{child.template_name}' did "
                    "not have any forms.",
                    sortid="simple/pos/129",
                )
            head_templates.append(
                TemplateData(
                    name=child.template_name,
                    args={
                        str(k): clean_node(wxr, None, v)
                        for k, v in child.template_parameters.items()
                    },
                    expansion="[POS TABLE]"
                    # Clean node returns an empty string for a table.
                    # expansion = clean_node(wxr, None, child)
                )
            )
        else:
            break
    else:
        # BUGFIX: if the loop completed without breaking (every remaining
        # node was a POS template), `i` would still index the LAST template,
        # and the gloss-scanning code below would process that template a
        # second time via pos_contents[i:]. Step past it, mirroring the
        # identical for-else pattern in parse_gloss().
        i += 1

    # sorted() instead of list() for a deterministic tag order after the
    # set-based deduplication.
    template_tags = sorted(set(template_tags))
    data.forms.extend(template_forms)
    data.forms = remove_duplicate_forms(wxr, data.forms)
    data.tags.extend(template_tags)
    data.head_templates.extend(head_templates)

    found_list = False
    got_senses = False
    for child in pos_contents[i:]:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses, but with
        # Simple English Wiktionary that seems rare.
        # logger.debug(f"{child}")
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            senses = recurse_glosses(wxr, child, data)
            found_list = True
            if len(senses) > 0:
                got_senses = True
                data.senses.extend(senses)

    if not got_senses and found_list:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    # If there is no list, clump everything into one gloss.
    if not found_list:
        sense = Sense()
        found_gloss = parse_gloss(wxr, sense, pos_contents[i:])
        if found_gloss is True or len(sense.raw_tags) > 0:
            convert_tags_in_sense(sense)
            if len(sense.glosses) == 0 and "no-gloss" not in sense.tags:
                sense.tags.append("no-gloss")
            data.senses.append(sense)

    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    return data