Coverage for src/wiktextract/extractor/el/head.py: 74%

165 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2from unicodedata import name as unicode_name 

3 

4from mediawiki_langcodes import code_to_name 

5 

6from wiktextract.extractor.en.form_descriptions import distw 

7from wiktextract.wxr_context import WiktextractContext 

8 

9from .models import Form, WordEntry 

10 

11BOLD_RE = re.compile(r"(__/?[BIL]__|\(|\)|, |\. |: )") 

12 

13 

14def parse_head(wxr: WiktextractContext, pos_data: WordEntry, text: str) -> bool: 

15 split_text = BOLD_RE.split(text) 

16 # print(split_text) 

17 

18 if not split_text[0] == "": 

19 # This should always be True; maybe an assert? 

20 # Turns out *some* articles add `-` before the template, like funa... 

21 if split_text[0] in ("-", "το "): 21 ↛ 29line 21 didn't jump to line 29 because the condition on line 21 was always true

22 if len(split_text) > 3: 22 ↛ 27line 22 didn't jump to line 27 because the condition on line 22 was always true

23 # Just throw the prefix into the (probably) bolded text 

24 split_text[2] = split_text[0] + split_text[2] 

25 split_text[0] = "" 

26 else: 

27 return False 

28 else: 

29 return False 

30 

31 forms: list[Form] = [] 

32 # print_blocks = [] 

33 

34 for form_ret in partition_head_forms(wxr, split_text): 

35 # print_blocks.append(form_block) 

36 # logger.info(f"\n §§ {form_ret}") 

37 forms.append(form_ret) 

38 

39 # logger.info( 

40 # f"\n §§ {wxr.wtp.title} -> {''.join(split_text)}\n § " 

41 # + "\n § ".join(f"{''.join(pb)}" for pb in print_blocks) 

42 # ) 

43 if len(forms) == 0: 43 ↛ 44line 43 didn't jump to line 44 because the condition on line 43 was never true

44 return False 

45 

46 pos_data.forms = forms 

47 

48 return True 

49 

50 

51# Sometimes bolded sections of the head are just smooshed together; what 

52# I've seen, it's "form -a -b -c", that is, suffixes. 

53SUFFIXES_RE = re.compile(r"\s+(-\w+)\b") 

54 

55 

56def partition_head_forms( 

57 wxr: WiktextractContext, split_text: list[str] 

58) -> list[Form]: 

59 if len(split_text) < 2: 59 ↛ 60line 59 didn't jump to line 60 because the condition on line 59 was never true

60 wxr.wtp.error( 

61 f"Failed to partition head forms; " f"too few items {split_text=}", 

62 sortid="head/50/20250303", 

63 ) 

64 return [] 

65 

66 Forms = list[str] 

67 Tags = list[str] 

68 blocks: list[tuple[Forms, Tags]] = [([], [])] 

69 current_forms: Forms = [] 

70 current_tags: Tags = [] 

71 

72 def push_new_block(): 

73 nonlocal current_forms 

74 nonlocal current_tags 

75 blocks.append((current_forms, current_tags)) 

76 current_forms = [] 

77 current_tags = [] 

78 

79 def extend_old_block(): 

80 nonlocal current_forms 

81 nonlocal current_tags 

82 blocks[-1][0].extend(current_forms) 

83 blocks[-1][1].extend(current_tags) 

84 current_forms = [] 

85 current_tags = [] 

86 

87 seen_italics = "__I__" in split_text 

88 seen_bold = "__B__" in split_text 

89 inside_parens = False 

90 inside_bold = False 

91 inside_link = False 

92 inside_italics = False 

93 

94 previous_token_was_period = False 

95 for i, t in enumerate(split_text): 

96 # print(f"{i}: {t=}") 

97 t2 = t.strip() 

98 if not t2 and t and previous_token_was_period: 98 ↛ 101line 98 didn't jump to line 101 because the condition on line 98 was never true

99 # Whitespace 

100 # print("Prev. was dot") 

101 previous_token_was_period = False 

102 push_new_block() 

103 continue 

104 t = t2 

105 if i % 2 == 0: 

106 previous_token_was_period = False 

107 if t in ("", "ή"): 

108 continue 

109 elif t in ("και", "&", ":", ".:"): 

110 push_new_block() 

111 continue 

112 

113 if i % 2 == 0: 

114 # Odd elements: text 

115 

116 if t == ".": 

117 previous_token_was_period = True 

118 continue 

119 

120 # Check if word is not in greek; if it's not, that's a form. 

121 # XXX this might be problematic if there's a stretch of unbolded 

122 # text where the bolding has just been forgotten. Fix by 

123 # checking each word for greekness? 

124 # This doesn't need to check if the language we're processing 

125 # is greek or not, because all non-greek words are 'forms'. 

126 found_language_code = False 

127 is_foreign_script = False 

128 for ch in t: 

129 if not ch.isalpha(): 

130 continue 

131 if not unicode_name(ch).startswith("GREEK"): 

132 if code_to_name(t) != "": 

133 found_language_code = True 

134 break 

135 is_foreign_script = True 

136 break 

137 

138 if found_language_code: 

139 break 

140 

141 if inside_italics: 

142 # Italicized words should always be tags 

143 current_tags.append(t) 

144 continue 

145 if is_foreign_script: 

146 current_forms.append(t) 

147 continue 

148 if inside_bold: 

149 # Bolded words should always be forms 

150 # Split off any suffixes inside the same bold node. 

151 suffixes = SUFFIXES_RE.split(t) 

152 for f in suffixes: 

153 f = f.strip() 

154 if not f: 154 ↛ 155line 154 didn't jump to line 155 because the condition on line 154 was never true

155 continue 

156 current_forms.append(f) 

157 continue 

158 

159 if inside_link or ( 159 ↛ 167line 159 didn't jump to line 167 because the condition on line 159 was never true

160 not inside_italics and not inside_bold and not seen_bold 

161 ): 

162 # Usually a form, sometimes a tag... 

163 # XXX handle titles with whitespace by doing the splitting 

164 # in N steps: there's one space, split A B C D with 

165 # A B, C D and (A), B C, (D) 

166 

167 if ( 

168 seen_italics and not seen_bold 

169 ): # there has been text in italics before 

170 current_forms.append(t) 

171 continue 

172 words = t.split() 

173 orig_words = (wxr.wtp.title or "").split() 

174 

175 if len(words) < len(orig_words): 

176 # The phrase we're looking at it shorter than the article 

177 # title in words; unlikely that a form like this loses 

178 # words (more likely to add words) so consider this a 

179 # tag: XXX if this turns out to be problematic 

180 current_tags.append(t) 

181 continue 

182 

183 matches = 0 

184 

185 for word in words: 

186 if distw(orig_words, word) < 0.4: 

187 matches += 1 

188 break 

189 

190 if matches > 0: # XXX use better heuristic; problem is that 

191 # percentage-wise, if you add two words to 

192 # one word then the percentage needed is low 

193 # to match it. 

194 current_forms.append(t) 

195 continue 

196 

197 current_tags.append(t) 

198 continue 

199 

200 continue 

201 

202 # Even elements: splitter tokens like commas, parens or formatting 

203 match t: 

204 case "(": 

205 if current_forms and current_tags: 205 ↛ 206line 205 didn't jump to line 206 because the condition on line 205 was never true

206 push_new_block() 

207 else: 

208 extend_old_block() 

209 # We don't support nested parens; XXX if there's a problem 

210 # with them 

211 inside_parens = True 

212 case ")": 

213 inside_parens = False 

214 # print(f"{current_forms=}, {current_tags=}, {t=}") 

215 if ( 215 ↛ 223line 215 didn't jump to line 223 because the condition on line 215 was never true

216 not current_forms 

217 and len(current_tags) == 1 

218 and code_to_name(current_tags[0]) != "" 

219 ): 

220 # There are a lot of `(en)` language code tags that we 

221 # don't care about because they're just repeating the 

222 # language code of the word entry itself! 

223 current_tags = [] 

224 continue 

225 if current_forms and current_tags: 225 ↛ 226line 225 didn't jump to line 226 because the condition on line 225 was never true

226 push_new_block() 

227 else: 

228 extend_old_block() 

229 case ",": 

230 if not inside_parens: 230 ↛ 95line 230 didn't jump to line 95 because the condition on line 230 was always true

231 if current_forms and current_tags: 

232 push_new_block() 

233 else: 

234 extend_old_block() 

235 case ":": 235 ↛ 236line 235 didn't jump to line 236 because the pattern on line 235 never matched

236 if not inside_parens: 

237 # Do not append to previous. `:` should, logically, 

238 # always point forward 

239 push_new_block() 

240 case ".": 240 ↛ 241line 240 didn't jump to line 241 because the pattern on line 240 never matched

241 if not inside_parens: 

242 push_new_block() 

243 case "__B__": 

244 # print(f"{current_forms=}, {current_tags=}") 

245 if current_forms and current_tags: 245 ↛ 246line 245 didn't jump to line 246 because the condition on line 245 was never true

246 push_new_block() 

247 else: 

248 extend_old_block() 

249 inside_bold = True 

250 case "__/B__": 

251 inside_bold = False 

252 case "__L__": 

253 inside_link = True 

254 case "__/L__": 

255 inside_link = False 

256 case "__I__": 

257 inside_italics = True 

258 case "__/I__": 258 ↛ 260line 258 didn't jump to line 260 because the pattern on line 258 always matched

259 inside_italics = False 

260 case _: 

261 pass 

262 # print(f"{t=}, {blocks=}") 

263 if len(current_forms) > 0 and len(current_tags) > 0: 263 ↛ 264line 263 didn't jump to line 264 because the condition on line 263 was never true

264 push_new_block() 

265 else: 

266 extend_old_block() 

267 

268 ret: list[Form] = [] 

269 

270 for forms, tags in blocks: 

271 # print(f"{forms=}, {tags=}") 

272 tags = sorted(set(tags)) 

273 

274 for form in forms: 

275 ret.append(Form(form=form, raw_tags=tags)) 

276 

277 return ret