Coverage for src/wiktextract/extractor/simple/pos.py: 80%

195 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from wikitextprocessor import NodeKind, TemplateArgs, TemplateNode, WikiNode 

2from wikitextprocessor.parser import LEVEL_KIND_FLAGS 

3 

4from wiktextract import WiktextractContext 

5from wiktextract.page import clean_node 

6 

7from .models import Example, Form, Linkage, Sense, TemplateData, WordEntry 

8from .section_titles import POS_HEADINGS 

9from .table import parse_pos_table 

10from .tags_utils import convert_tags_in_sense 

11from .text_utils import ( 

12 POS_ENDING_NUMBER_RE, 

13 POS_TEMPLATE_NAMES, 

14 STRIP_PUNCTUATION, 

15) 

16 

17# from wiktextract.wxr_logging import logger 

18 

19 

def remove_duplicate_forms(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Return `forms` with exact duplicates removed.

    Two forms are duplicates when their `form`, `tags` and `raw_tags`
    fields are all equal. Of each duplicate group the *last* occurrence
    is kept (each form is dropped if an identical one appears later).
    If nothing was removed, the original list object is returned.
    """
    if not forms:
        return []
    deduplicated = [
        candidate
        for idx, candidate in enumerate(forms)
        # Keep the form only when no identical form follows it.
        if not any(
            candidate.form == later.form
            and candidate.tags == later.tags
            and candidate.raw_tags == later.raw_tags
            for later in forms[idx + 1 :]
        )
    ]
    if len(deduplicated) < len(forms):
        # wxr.wtp.debug("Found duplicate forms", sortid="simple/pos/32")
        return deduplicated
    return forms

44 

45 

# A gloss-list item resolves to either a proper Sense or, for
# example/quotation lines, an Example; recurse_glosses1 returns a mix.
ExOrSense = Sense | Example

# Gloss-line templates whose expansion is suppressed entirely in
# gloss_template_fn; "exstub" is also special-cased in parse_gloss,
# where it marks the sense as "no-gloss".
IGNORED_GLOSS_TEMPLATES = ("exstub",)

49 

50 

def parse_gloss(
    wxr: WiktextractContext, parent_sense: Sense, contents: list[str | WikiNode]
) -> bool:
    """Take what is preferably a line of text and extract tags and a gloss from
    it. The data is inserted into parent_sense, and for recursion purposes
    we return a boolean that tells whether there was any gloss text in a
    lower node.

    Side effects on `parent_sense`: may set `synonyms`, `antonyms`, extend
    `raw_tags` (leading bracketed templates become raw tags) and append to
    `glosses`.
    """
    if len(contents) == 0:
        return False

    # Tags collected from leading templates like "(countable)".
    template_tags: list[str] = []
    found_template = False
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []

    # We define this subfunction here to use closure with synonyms and antonyms;
    # this is the usual way we do it with these kinds of _fn's in the main
    # extractor. You could also make a wrapper function that takes the
    # variables you want to enclose and returns a _fn function with those
    # enclosed, although I don't know if that is more or less efficient;
    # if you need to use the same _fn code in two places, this is the
    # way to go.
    # There's a more detailed explanation about using template_fn in
    # pronunciation.py.
    def gloss_template_fn(name: str, ht: TemplateArgs) -> str | None:
        # Intercept synonym/antonym templates: harvest their arguments into
        # Linkage objects and remove them from the rendered gloss text.
        if name in ("synonyms", "synonym", "syn"):
            for syn in ht.values():  # ht for 'hashtable'. Tatu comes from C.
                # The template parameters of `synonyms` is simple: just a list.
                if not syn:
                    continue
                synonyms.append(
                    Linkage(
                        word=clean_node(
                            wxr, parent_sense, clean_node(wxr, None, syn)
                        )
                    )
                )
            # Returning a string means replacing the 'expansion' that would
            # have otherwise appeared there with it; `None` leaves things alone.
            return ""
        if name in ("antonyms", "antonym", "ant"):
            for ant in ht.values():
                if not ant:
                    continue
                antonyms.append(
                    Linkage(
                        word=clean_node(
                            wxr, parent_sense, clean_node(wxr, None, ant)
                        )
                    )
                )
            return ""

        if name in IGNORED_GLOSS_TEMPLATES:
            return ""

        # Don't handle other templates here.
        return None

    # Scan leading template nodes for tags; stop at the first real content.
    # After the loop, `i` is the index of the first non-tag element.
    for i, tnode in enumerate(contents):
        if (
            isinstance(tnode, str)
            and tnode.strip(STRIP_PUNCTUATION)
            or not isinstance(tnode, (TemplateNode, str))
        ):
            # When we encounter the first naked string that isn't just
            # whitespace or the first WikiNode that isn't a template.
            break
        if isinstance(tnode, TemplateNode):
            if tnode.template_name == "exstub":
                # Stub marker: flag the sense and bail out; no gloss text.
                parent_sense.raw_tags.append("no-gloss")
                return False
            tag_text = clean_node(
                wxr, parent_sense, tnode, template_fn=gloss_template_fn
            )
            if tag_text.endswith((")", "]")):
                # Simple wiktionary is pretty good at making these templates
                # have brackets
                tag_text = tag_text.strip(STRIP_PUNCTUATION)
                if tag_text:
                    found_template = True
                    template_tags.append(tag_text)
            else:
                # looks like normal text, so probably something {{plural of}}.
                break
    # else for the for loop: if we never break
    else:
        # If we never break, that means the last item was a tag.
        # Bump `i` past it so the slice below excludes the tag templates.
        i += 1

    if found_template is True:
        # Drop the leading tag templates; only real gloss content remains.
        contents = contents[i:]

    text = clean_node(
        wxr, parent_sense, contents, template_fn=gloss_template_fn
    )

    if len(synonyms) > 0:
        parent_sense.synonyms = synonyms

    if len(antonyms) > 0:
        parent_sense.antonyms = antonyms

    if len(template_tags) > 0:
        parent_sense.raw_tags.extend(template_tags)

    if len(text) > 0:
        parent_sense.glosses.append(text)
        return True

    return False

162 

163 

def recurse_glosses1(
    wxr: WiktextractContext,
    parent_sense: Sense,
    node: WikiNode,
) -> list[ExOrSense]:
    """Helper function for recurse_glosses.

    Walks one LIST or LIST_ITEM node. Returns a list of Sense objects
    (and, for example/quotation list items, Example objects that callers
    are expected to attach to their own sense). `parent_sense` is deep-
    copied for each child so siblings don't share mutations.
    """
    ret: list[ExOrSense] = []
    found_gloss = False

    if node.kind == NodeKind.LIST:
        # A LIST just fans out into its LIST_ITEM children.
        for child in node.children:
            if isinstance(child, str) or child.kind != NodeKind.LIST_ITEM:
                # This should never happen
                wxr.wtp.error(
                    f"{child=} is direct child of NodeKind.LIST",
                    sortid="simple/pos/44",
                )
                continue
            ret.extend(
                recurse_glosses1(wxr, parent_sense.model_copy(deep=True), child)
            )
    elif node.kind == NodeKind.LIST_ITEM:
        # Split the item into its own content and any nested sublists.
        contents = []
        sublists = []
        broke_out = False
        for i, c in enumerate(node.children):
            # The contents ends when a new sublist begins.
            if isinstance(c, WikiNode) and c.kind == NodeKind.LIST:
                broke_out = True
                break
            contents.append(c)
        if broke_out is True:
            # Everything from the first nested LIST onward.
            sublists = node.children[i:]

        # A LIST and LIST_ITEM `sarg` is basically the prefix of the line, like
        # `#` or `##:`: the token that appears at the very start of a line that
        # is used to parse the depth and structure of lists.
        if node.sarg.endswith((":", "*")) and node.sarg not in (":", "*"):
            # This is either a quotation or example.
            # The `not in` filters out lines that are usually notes or random
            # stuff not inside gloss lists; see "dare".
            text = clean_node(
                wxr, parent_sense, contents
            )  # clean_node strip()s already so no need to .strip() here.
            example = Example(text=text)
            # logger.debug(f"{wxr.wtp.title}/example\n{text}")
            # We will not bother with subglosses for example entries;
            # XXX do something about it if it becomes relevant.
            return [example]
        elif node.sarg in (":", "*"):
            # Bare ":"/"*" lines are notes/random content, not glosses.
            wxr.wtp.debug(
                f"Gloss item line starts with {node.sarg=}.",
                sortid="simple/pos/214",
            )
            return []

        found_gloss = parse_gloss(wxr, parent_sense, contents)

        # Recurse into sublists: Examples attach to this sense, Senses
        # (subglosses) are collected into `ret`.
        for sl in sublists:
            if not (isinstance(sl, WikiNode) and sl.kind == NodeKind.LIST):
                # Should not happen
                wxr.wtp.error(
                    f"Sublist is not NodeKind.LIST: {sublists=!r}",
                    sortid="simple/pos/82",
                )
                continue
            for r in recurse_glosses1(
                wxr, parent_sense.model_copy(deep=True), sl
            ):
                if isinstance(r, Example):
                    parent_sense.examples.append(r)
                else:
                    ret.append(r)

    if len(ret) > 0:
        # the recursion returned actual senses from below, so we will
        # ignore everything else (incl. any example data that might have
        # been given to parent_sense) and return that instead.
        # XXX if this becomes relevant, add the example data to a returned
        # subsense instead?
        return ret

    if found_gloss is True or "no-gloss" in parent_sense.raw_tags:
        return [parent_sense]

    return []

250 

251 

def recurse_glosses(
    wxr: WiktextractContext, node: WikiNode, data: WordEntry
) -> list[Sense]:
    """Recurse through WikiNodes to find glosses and sense-related data.

    Examples must have been attached to their senses below this level;
    any Example that reaches here is logged as an error and skipped.
    Tags are normalized on every returned sense.
    """
    senses: list[Sense] = []
    for r in recurse_glosses1(wxr, Sense(), node):
        if isinstance(r, Example):
            # Examples belong inside senses; bubbling up here is a bug.
            wxr.wtp.error(
                f"Example() has bubbled to recurse_glosses: {r.json()}",
                sortid="simple/pos/glosses",
            )
            continue
        convert_tags_in_sense(r)
        senses.append(r)
    return senses

273 

274 

def process_pos(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordEntry,
    # the "noun" in "Noun 2"
    pos_title: str,
    # the "2" in "Noun 2"
    pos_num: int = -1,
) -> WordEntry | None:
    """Process a part-of-speech section, like 'Noun'. `data` provides basic
    data common with other POS sections, like pronunciation or etymology.
    `data` is mutated in place and returned; a "no-gloss" sense is appended
    when no real gloss content could be extracted."""

    # Metadata for different part-of-speech kinds.
    pos_meta = POS_HEADINGS[pos_title]
    data.pos = pos_meta["pos"]  # the internal/translated name for the POS
    data.pos_num = pos_num  # SEW uses "Noun 1", "Noun 2" style headings.

    # Sound data associated with this POS might be coming from a shared
    # section, in which case we've tried to tag the sound data with its
    # pos name + number if possible. Filter out stuff that doesn't fit.
    new_sounds = []
    for sound in data.sounds:
        if len(sound.poses) == 0:
            # This sound data wasn't tagged with any specific pos section(s), so
            # we add it to everything; this is basically the default behavior.
            new_sounds.append(sound)
        else:
            for sound_pos in sound.poses:
                # Split a tag like "noun 2" into its name and number parts.
                m = POS_ENDING_NUMBER_RE.search(sound_pos)
                if m is not None:
                    s_num = int(m.group(1).strip())
                    s_pos = sound_pos[: m.start()].strip().lower()
                else:
                    s_pos = sound_pos.strip().lower()
                    s_num = -1
                # Normalize the name through POS_HEADINGS so it is comparable
                # with data.pos.
                sound_meta = POS_HEADINGS[s_pos]
                s_pos = sound_meta["pos"]
                if s_pos == data.pos and s_num == data.pos_num:
                    new_sounds.append(sound)
    data.sounds = new_sounds

    # Get child nodes except headings (= LEVEL).
    pos_contents = list(node.invert_find_child(LEVEL_KIND_FLAGS))

    if len(pos_contents) == 0 or (
        len(pos_contents) == 1
        and isinstance(pos_contents[0], str)
        # Just a single newline or whitespace after heading.
        and not pos_contents[0].strip()
    ):
        # Most probably a bad article.
        wxr.wtp.error(
            "No body for Part-of-speech section.", sortid="simple/pos/271"
        )
        data.senses.append(Sense(tags=["no-gloss"]))
        return data

    # Check POS templates at the start of the section (Simple English specific).
    template_tags: list[str] = []
    template_forms: list[Form] = []
    head_templates: list[TemplateData] = []

    # Typically, a Wiktionary has a word head before glosses, which contains
    # the main form of the word (usually same as the title of the article)
    # and common other forms of the word, plus qualifiers and other data
    # like that; however, for Simple English Wiktionary the format is to
    # have a table (or two, if there's variations) containing the word's
    # conjugation or declension, so we don't have to actually parse the
    # head here.
    # After this loop, `i` is the index of the first node that is NOT a
    # leading POS template (see the for-else below).
    for i, child in enumerate(pos_contents):
        if isinstance(child, str) and not child.strip():
            # Ignore whitespace
            continue
        # TemplateNode is a subclass of WikiNode; not all kinds of nodes have
        # a subclass, but TemplateNode is handy.
        if (
            isinstance(child, TemplateNode)
            and child.template_name in POS_TEMPLATE_NAMES
        ):
            if child.template_name not in pos_meta["templates"]:
                wxr.wtp.debug(
                    f"Template {child.template_name} "
                    f"found under {pos_title}",
                    sortid="simple/pos/93",
                )
            elif ttags := pos_meta["templates"][child.template_name]:
                # Some templates have associated tags:
                # `irrnoun` -> ["irregular"]
                template_tags.extend(ttags)
            if forms := parse_pos_table(wxr, child, data):
                template_forms.extend(forms)
            else:
                wxr.wtp.warning(
                    f"POS template '{child.template_name}' did "
                    "not have any forms.",
                    sortid="simple/pos/129",
                )
            head_templates.append(
                TemplateData(
                    name=child.template_name,
                    args={
                        str(k): clean_node(wxr, None, v)
                        for k, v in child.template_parameters.items()
                    },
                    expansion="[POS TABLE]"
                    # Clean node returns an empty string for a table.
                    # expansion = clean_node(wxr, None, child)
                )
            )
        else:
            break
    else:
        # BUGFIX: if the loop completed without breaking (every remaining
        # node was a POS template), `i` would still index the LAST template,
        # and the gloss-scanning code below would process that template a
        # second time via pos_contents[i:]. Step past it, mirroring the
        # identical for-else pattern in parse_gloss().
        i += 1

    # sorted() instead of list() for a deterministic tag order after the
    # set-based deduplication.
    template_tags = sorted(set(template_tags))
    data.forms.extend(template_forms)
    data.forms = remove_duplicate_forms(wxr, data.forms)
    data.tags.extend(template_tags)
    data.head_templates.extend(head_templates)

    found_list = False
    got_senses = False
    for child in pos_contents[i:]:
        # Wiktionaries handle glosses the usual way: with numbered lists.
        # Each list entry is a gloss, sometimes with subglosses, but with
        # Simple English Wiktionary that seems rare.
        # logger.debug(f"{child}")
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            senses = recurse_glosses(wxr, child, data)
            found_list = True
            if len(senses) > 0:
                got_senses = True
                data.senses.extend(senses)

    if not got_senses and found_list:
        wxr.wtp.error(
            "POS had a list, but the list did not return senses.",
            sortid="simple/pos/313",
        )

    # If there is no list, clump everything into one gloss.
    if not found_list:
        sense = Sense()
        found_gloss = parse_gloss(wxr, sense, pos_contents[i:])
        if found_gloss is True or len(sense.raw_tags) > 0:
            convert_tags_in_sense(sense)
            if len(sense.glosses) == 0 and "no-gloss" not in sense.tags:
                sense.tags.append("no-gloss")
            data.senses.append(sense)

    if len(data.senses) == 0:
        data.senses.append(Sense(tags=["no-gloss"]))

    return data