Coverage for src/wiktextract/extractor/el/head.py: 72%

166 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2 

3from mediawiki_langcodes import code_to_name 

4 

5from unicodedata import name as unicode_name 

6 

7from wiktextract.extractor.en.form_descriptions import distw 

8from wiktextract.wxr_context import WiktextractContext 

9from wiktextract.wxr_logging import logger 

10 

11from .models import Form, WordEntry 

12 

# Splits a head line on formatting markers (`__B__`, `__/B__`, `__I__`,
# `__/I__`, `__L__`, `__/L__`), parentheses and the separators ", ", ". "
# and ": ".  The whole pattern is one capturing group, so `re.split` keeps the
# delimiters in its result: text chunks land on even indices and the matched
# delimiters on odd indices.
BOLD_RE = re.compile(r"(__/?[BIL]__|\(|\)|, |\. |: )")

14 

15 

def parse_head(
    wxr: WiktextractContext, pos_data: WordEntry, text: str
) -> bool:
    """Extract the forms of a head line and store them on ``pos_data``.

    ``text`` is tokenized with ``BOLD_RE`` and the tokens are handed to
    ``partition_head_forms``.

    Returns ``True`` when at least one form was produced (``pos_data.forms``
    is then replaced by the new list) and ``False`` when the head could not
    be handled or yielded no forms (``pos_data`` is left untouched).
    """
    tokens = BOLD_RE.split(text)

    leading = tokens[0]
    if leading != "":
        # The head normally starts with a formatting token, which makes the
        # first text chunk empty.  Turns out *some* articles add `-` (or
        # `το `) before the template; anything else is rejected.
        if leading not in ("-", "το ") or len(tokens) <= 3:
            return False
        # Just throw the prefix into the (probably) bolded text
        tokens[2] = leading + tokens[2]
        tokens[0] = ""

    found_forms: list[Form] = list(partition_head_forms(wxr, tokens))
    if not found_forms:
        return False

    pos_data.forms = found_forms
    return True

51 

52 

# Sometimes bolded sections of the head are just smooshed together; from what
# I've seen, it's "form -a -b -c", that is, suffixes.
# The capturing group makes `SUFFIXES_RE.split` return each `-suffix` as its
# own element, with the whitespace in front of it dropped.
SUFFIXES_RE = re.compile(r"\s+(-\w+)\b")

56 

57 

58def partition_head_forms( 

59 wxr: WiktextractContext, split_text: list[str] 

60) -> list[Form]: 

61 if len(split_text) < 2: 61 ↛ 62line 61 didn't jump to line 62 because the condition on line 61 was never true

62 wxr.wtp.error( 

63 f"Failed to partition head forms; " f"too few items {split_text=}", 

64 sortid="head/50/20250303", 

65 ) 

66 return [] 

67 

68 Forms = list[str] 

69 Tags = list[str] 

70 blocks: list[tuple[Forms, Tags]] = [([], [])] 

71 current_forms: Forms = [] 

72 current_tags: Tags = [] 

73 

74 def push_new_block(): 

75 nonlocal current_forms 

76 nonlocal current_tags 

77 blocks.append((current_forms, current_tags)) 

78 current_forms = [] 

79 current_tags = [] 

80 

81 def extend_old_block(): 

82 nonlocal current_forms 

83 nonlocal current_tags 

84 blocks[-1][0].extend(current_forms) 

85 blocks[-1][1].extend(current_tags) 

86 current_forms = [] 

87 current_tags = [] 

88 

89 seen_italics = False 

90 inside_parens = False 

91 inside_bold = False 

92 inside_link = False 

93 inside_italics = False 

94 

95 previous_token_was_period = False 

96 for i, t in enumerate(split_text): 

97 # print(f"{i}: {t=}") 

98 t2 = t.strip() 

99 if not t2 and t and previous_token_was_period: 99 ↛ 102line 99 didn't jump to line 102 because the condition on line 99 was never true

100 # Whitespace 

101 # print("Prev. was dot") 

102 previous_token_was_period = False 

103 push_new_block() 

104 continue 

105 t = t2 

106 if i % 2 == 0: 

107 previous_token_was_period = False 

108 if t in ("", "ή"): 

109 continue 

110 elif t in ("και", "&", ":", ".:"): 

111 push_new_block() 

112 continue 

113 

114 if i % 2 == 0: 

115 # Odd elements: text 

116 

117 if t == ".": 

118 previous_token_was_period = True 

119 continue 

120 

121 # Check if word is not in greek; if it's not, that's a form. 

122 # XXX this might be problematic if there's a stretch of unbolded 

123 # text where the bolding has just been forgotten. Fix by 

124 # checking each word for greekness? 

125 # This doesn't need to check if the language we're processing 

126 # is greek or not, because all non-greek words are 'forms'. 

127 found_language_code = False 

128 is_foreign_script = False 

129 for ch in t: 

130 if not ch.isalpha(): 

131 continue 

132 if not unicode_name(ch).startswith("GREEK"): 

133 if code_to_name(t) != "": 

134 found_language_code = True 

135 break 

136 is_foreign_script = True 

137 break 

138 

139 if found_language_code: 

140 break 

141 

142 if inside_italics: 

143 # Italicized words should always be tags 

144 current_tags.append(t) 

145 continue 

146 if is_foreign_script: 

147 current_forms.append(t) 

148 continue 

149 if inside_bold: 149 ↛ 160line 149 didn't jump to line 160 because the condition on line 149 was always true

150 # Bolded words should always be forms 

151 # Split off any suffixes inside the same bold node. 

152 suffixes = SUFFIXES_RE.split(t) 

153 for f in suffixes: 

154 f = f.strip() 

155 if not f: 155 ↛ 156line 155 didn't jump to line 156 because the condition on line 155 was never true

156 continue 

157 current_forms.append(f) 

158 continue 

159 

160 if inside_link or not ( 

161 inside_link or inside_italics or inside_bold 

162 ): 

163 # Usually a form, sometimes a tag... 

164 # XXX handle titles with whitespace by doing the splitting 

165 # in N steps: there's one space, split A B C D with 

166 # A B, C D and (A), B C, (D) 

167 

168 if seen_italics: # there has been text in italics before 

169 current_forms.append(t) 

170 continue 

171 words = t.split() 

172 orig_words = (wxr.wtp.title or "").split() 

173 

174 if len(words) < len(orig_words): 

175 # The phrase we're looking at it shorter than the article 

176 # title in words; unlikely that a form like this loses 

177 # words (more likely to add words) so consider this a 

178 # tag: XXX if this turns out to be problematic 

179 current_tags.append(t) 

180 continue 

181 

182 matches = 0 

183 

184 for word in words: 

185 if distw(orig_words, word) < 0.4: 

186 matches += 1 

187 break 

188 

189 if matches > 0: # XXX use better heuristic; problem is that 

190 # percentage-wise, if you add two words to 

191 # one word then the percentage needed is low 

192 # to match it. 

193 current_forms.append(t) 

194 continue 

195 

196 current_tags.append(t) 

197 continue 

198 

199 continue 

200 

201 # Even elements: splitter tokens like commas, parens or formatting 

202 match t: 

203 case "(": 

204 if current_forms and current_tags: 204 ↛ 205line 204 didn't jump to line 205 because the condition on line 204 was never true

205 push_new_block() 

206 else: 

207 extend_old_block() 

208 # We don't support nested parens; XXX if there's a problem 

209 # with them 

210 inside_parens = True 

211 case ")": 

212 inside_parens = False 

213 # print(f"{current_forms=}, {current_tags=}, {t=}") 

214 if ( 214 ↛ 222line 214 didn't jump to line 222 because the condition on line 214 was never true

215 not current_forms 

216 and len(current_tags) == 1 

217 and code_to_name(current_tags[0]) != "" 

218 ): 

219 # There are a lot of `(en)` language code tags that we 

220 # don't care about because they're just repeating the 

221 # language code of the word entry itself! 

222 current_tags = [] 

223 continue 

224 if current_forms and current_tags: 224 ↛ 225line 224 didn't jump to line 225 because the condition on line 224 was never true

225 push_new_block() 

226 else: 

227 extend_old_block() 

228 case ",": 

229 if not inside_parens: 229 ↛ 96line 229 didn't jump to line 96 because the condition on line 229 was always true

230 if current_forms and current_tags: 230 ↛ 231line 230 didn't jump to line 231 because the condition on line 230 was never true

231 push_new_block() 

232 else: 

233 extend_old_block() 

234 case ":": 234 ↛ 235line 234 didn't jump to line 235 because the pattern on line 234 never matched

235 if not inside_parens: 

236 # Do not append to previous. `:` should, logically, 

237 # always point forward 

238 push_new_block() 

239 case ".": 239 ↛ 240line 239 didn't jump to line 240 because the pattern on line 239 never matched

240 if not inside_parens: 

241 push_new_block() 

242 case "__B__": 

243 # print(f"{current_forms=}, {current_tags=}") 

244 if current_forms and current_tags: 244 ↛ 245line 244 didn't jump to line 245 because the condition on line 244 was never true

245 push_new_block() 

246 else: 

247 extend_old_block() 

248 inside_bold = True 

249 case "__/B__": 

250 inside_bold = False 

251 case "__L__": 

252 inside_link = True 

253 case "__/L__": 

254 inside_link = False 

255 case "__I__": 

256 seen_italics = True 

257 inside_italics = True 

258 case "__/I__": 258 ↛ 260line 258 didn't jump to line 260 because the pattern on line 258 always matched

259 inside_italics = False 

260 case _: 

261 pass 

262 # print(f"{t=}, {blocks=}") 

263 if len(current_forms) > 0 and len(current_tags) > 0: 263 ↛ 264line 263 didn't jump to line 264 because the condition on line 263 was never true

264 push_new_block() 

265 else: 

266 extend_old_block() 

267 

268 ret: list[Form] = [] 

269 

270 for forms, tags in blocks: 

271 # print(f"{forms=}, {tags=}") 

272 tags = list(set(tags)) 

273 for form in forms: 

274 ret.append(Form(form=form, raw_tags=tags)) 

275 

276 return ret