Coverage for src/wiktextract/extractor/el/head.py: 77%

165 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1import re 

2from unicodedata import name as unicode_name 

3 

4from mediawiki_langcodes import code_to_name 

5 

6from wiktextract.clean import clean_value 

7from wiktextract.extractor.en.form_descriptions import distw 

8from wiktextract.wxr_context import WiktextractContext 

9 

10from .models import Form 

11 

# Splitter for head-line text.  It matches one of the formatting markers
# __B__, __/B__, __I__, __/I__, __L__, __/L__, a single "(" or ")", or one of
# the sequences ", ", ". ", ": ".  The whole pattern is a single capturing
# group, so re.split() keeps every matched separator in its result: text
# pieces land on the even indices and separators on the odd indices.
# NOTE(review): the __X__ markers are presumably bold/italic/link markers
# inserted upstream before clean_value() runs -- confirm against the caller.
12BOLD_RE = re.compile(r"(__/?[BIL]__|\(|\)|, |\. |: )") 

13 

14 

def parse_head(wxr: WiktextractContext, text: str) -> list[Form]:
    """Extract the forms listed on an entry's head line.

    ``text`` is cleaned with ``clean_value``, cut into alternating text and
    separator pieces with ``BOLD_RE`` and then handed to
    ``partition_head_forms``.

    Returns a new list with the forms produced by ``partition_head_forms``.
    An empty list is returned when the head starts with text other than the
    known prefixes ``"-"`` / ``"το "``, or when it starts with such a prefix
    but the split produced three pieces or fewer.
    """
    pieces = BOLD_RE.split(clean_value(wxr, text))

    leading = pieces[0]
    if leading != "":
        # Normally the head opens with a separator, so the first piece is
        # empty.  *Some* articles put `-` (or the article `το `) in front of
        # the template; anything else in that position is not understood.
        if leading not in ("-", "το ") or len(pieces) <= 3:
            return []
        # Glue the prefix onto the first text piece after the opening
        # separator, which is (probably) the bolded text.
        pieces[2] = leading + pieces[2]
        pieces[0] = ""

    return list(partition_head_forms(wxr, pieces))

46 

47 

48# Sometimes bolded sections of the head are just smooshed together; from 

49# what I've seen, it's "form -a -b -c", that is, a form followed by suffixes. 
# The pattern matches a run of whitespace followed by a hyphen-prefixed word
# and captures only the "-word" part, so SUFFIXES_RE.split("form -a -b")
# yields the form and each suffix (plus empty strings between adjacent
# matches, which the caller has to skip).
50SUFFIXES_RE = re.compile(r"\s+(-\w+)\b") 

51 

52 

# partition_head_forms: group the pieces of a split head line into blocks of
# (forms, tags) and return one Form per collected form.
#
# Parameters:
#   wxr        -- context object; used here for wxr.wtp.error() and
#                 wxr.wtp.title.
#   split_text -- list of strings in which even indices are treated as text
#                 and odd indices as splitter tokens ("(", ")", ",", ":", ".",
#                 "__B__", "__/B__", "__L__", "__/L__", "__I__", "__/I__").
#                 parse_head() passes the result of BOLD_RE.split().
# Returns:
#   A list of Form objects built with source="header"; every form of a block
#   gets that block's tags, de-duplicated and sorted, as raw_tags.  When
#   split_text has fewer than two items an error is reported through
#   wxr.wtp.error() and an empty list is returned.
#
# NOTE(review): this listing has lost its indentation, so the nesting of some
# statements (for example the `break` statements on listing lines 134 and
# 137, and the checks on listing lines 105-109) cannot be read off here --
# confirm against the real source file before changing any logic.
53def partition_head_forms( 

54 wxr: WiktextractContext, split_text: list[str] 

55) -> list[Form]: 

56 if len(split_text) < 2: 56 ↛ 57line 56 didn't jump to line 57 because the condition on line 56 was never true

57 wxr.wtp.error( 

58 f"Failed to partition head forms; too few items {split_text=}", 

59 sortid="head/50/20250303", 

60 ) 

61 return [] 

62 

# Forms and Tags are local aliases used only in the annotations below.
# `blocks` starts with one empty (forms, tags) pair, so extend_old_block()
# always has a previous block to extend.  current_forms / current_tags hold
# the pending items that have not been assigned to a block yet.
63 Forms = list[str] 

64 Tags = list[str] 

65 blocks: list[tuple[Forms, Tags]] = [([], [])] 

66 current_forms: Forms = [] 

67 current_tags: Tags = [] 

68 

# Append the pending forms and tags to `blocks` as a new block, then start
# fresh pending lists.
69 def push_new_block() -> None: 

70 nonlocal current_forms 

71 nonlocal current_tags 

72 blocks.append((current_forms, current_tags)) 

73 current_forms = [] 

74 current_tags = [] 

75 

# Merge the pending forms and tags into the most recent block, then start
# fresh pending lists.
76 def extend_old_block() -> None: 

77 nonlocal current_forms 

78 nonlocal current_tags 

79 blocks[-1][0].extend(current_forms) 

80 blocks[-1][1].extend(current_tags) 

81 current_forms = [] 

82 current_tags = [] 

83 

# seen_italics / seen_bold: whether an opening italics / bold marker occurs
# anywhere in split_text.  The inside_* flags track the markers and
# parentheses that are currently open while walking the tokens.
84 seen_italics = "__I__" in split_text 

85 seen_bold = "__B__" in split_text 

86 inside_parens = False 

87 inside_bold = False 

88 inside_link = False 

89 inside_italics = False 

90 

# Set when a text piece is exactly "."; a following piece that consists only
# of whitespace then starts a new block.
91 previous_token_was_period = False 

92 for i, t in enumerate(split_text): 

93 # print(f"{i}: {t=}") 

94 # print(f"{current_forms=}, {current_tags=}. Now: {t=}") 

95 t2 = t.strip() 

96 if not t2 and t and previous_token_was_period: 96 ↛ 99line 96 didn't jump to line 99 because the condition on line 96 was never true

97 # Whitespace 

98 # print("Prev. was dot") 

99 previous_token_was_period = False 

100 push_new_block() 

101 continue 

102 t = t2 

103 if i % 2 == 0: 

104 previous_token_was_period = False 

# Empty pieces and "ή" are skipped; "και", "&", ":" and ".:" start a new
# block.
105 if t in ("", "ή"): 

106 continue 

107 elif t in ("και", "&", ":", ".:"): 

108 push_new_block() 

109 continue 

110 

111 if i % 2 == 0: 

112 # Text elements (odd when counting from 1, i.e. even index i) 

113 

114 if t == ".": 

115 previous_token_was_period = True 

116 continue 

117 

118 # Check if word is not in greek; if it's not, that's a form. 

119 # XXX this might be problematic if there's a stretch of unbolded 

120 # text where the bolding has just been forgotten. Fix by 

121 # checking each word for greekness? 

122 # This doesn't need to check if the language we're processing 

123 # is greek or not, because all non-greek words are 'forms'. 

# A character counts as Greek when its Unicode name starts with "GREEK".
# For a non-Greek character the whole token `t` is first looked up with
# code_to_name(); a non-empty result marks it as a language code instead of
# a foreign-script form.
# NOTE(review): code_to_name() presumably returns "" for unknown codes --
# confirm against mediawiki_langcodes.
124 found_language_code = False 

125 is_foreign_script = False 

126 for ch in t: 

127 if not ch.isalpha(): 

128 continue 

129 if not unicode_name(ch).startswith("GREEK"): 

130 if code_to_name(t) != "": 

131 found_language_code = True 

132 break 

133 is_foreign_script = True 

134 break 

135 

# NOTE(review): if this `break` sits at token-loop level it stops processing
# of the whole head as soon as a language-code token is seen -- confirm that
# this is intended.
136 if found_language_code: 

137 break 

138 

139 if inside_italics: 

140 # Italicized words should always be tags 

141 current_tags.append(t) 

142 continue 

143 if is_foreign_script: 

144 current_forms.append(t) 

145 continue 

146 if inside_bold: 

147 # Bolded words should always be forms 

148 # Split off any suffixes inside the same bold node. 

149 suffixes = SUFFIXES_RE.split(t) 

150 for f in suffixes: 

151 f = f.strip() 

152 if not f: 152 ↛ 153line 152 didn't jump to line 153 because the condition on line 152 was never true

153 continue 

154 current_forms.append(f) 

155 # print(f"inside_bold {t=} {current_forms=}") 

156 continue 

157 

158 if inside_link or ( 158 ↛ 166line 158 didn't jump to line 166 because the condition on line 158 was never true

159 not inside_italics and not inside_bold and not seen_bold 

160 ): 

161 # Usually a form, sometimes a tag... 

162 # XXX handle titles with whitespace by doing the splitting 

163 # in N steps: there's one space, split A B C D with 

164 # A B, C D and (A), B C, (D) 

165 

166 if ( 

167 seen_italics and not seen_bold 

168 ): # there has been text in italics before 

169 current_forms.append(t) 

170 continue 

171 words = t.split() 

172 orig_words = (wxr.wtp.title or "").split() 

173 

174 if len(words) < len(orig_words): 

175 # The phrase we're looking at is shorter than the article 

176 # title in words; unlikely that a form like this loses 

177 # words (more likely to add words) so consider this a 

178 # tag: XXX if this turns out to be problematic 

179 current_tags.append(t) 

180 continue 

181 

# `matches` is only ever tested for being non-zero: the loop stops at the
# first word whose distw() value against the title words is below 0.4.
182 matches = 0 

183 

184 for word in words: 

185 if distw(orig_words, word) < 0.4: 

186 matches += 1 

187 break 

188 

189 if matches > 0: # XXX use better heuristic; problem is that 

190 # percentage-wise, if you add two words to 

191 # one word then the percentage needed is low 

192 # to match it. 

193 current_forms.append(t) 

194 continue 

195 

196 current_tags.append(t) 

197 continue 

198 

199 continue 

200 

201 # Splitter elements (even when counting from 1, i.e. odd index i): commas, parens or formatting 

202 match t: 

203 case "(": 

204 if current_forms and current_tags: 

205 push_new_block() 

206 else: 

207 extend_old_block() 

208 # We don't support nested parens; XXX if there's a problem 

209 # with them 

210 inside_parens = True 

211 case ")": 

212 inside_parens = False 

213 # print(f"{current_forms=}, {current_tags=}, {t=}") 

214 if ( 214 ↛ 222line 214 didn't jump to line 222 because the condition on line 214 was never true

215 not current_forms 

216 and len(current_tags) == 1 

217 and code_to_name(current_tags[0]) != "" 

218 ): 

219 # There are a lot of `(en)` language code tags that we 

220 # don't care about because they're just repeating the 

221 # language code of the word entry itself! 

222 current_tags = [] 

223 continue 

224 if current_forms and current_tags: 

225 push_new_block() 

226 else: 

227 extend_old_block() 

228 case ",": 

229 if not inside_parens: 229 ↛ 92line 229 didn't jump to line 92 because the condition on line 229 was always true

230 if current_forms and current_tags: 

231 push_new_block() 

232 else: 

233 extend_old_block() 

234 case ":": 234 ↛ 235line 234 didn't jump to line 235 because the pattern on line 234 never matched

235 if not inside_parens: 

236 # Do not append to previous. `:` should, logically, 

237 # always point forward 

238 push_new_block() 

239 case ".": 239 ↛ 240line 239 didn't jump to line 240 because the pattern on line 239 never matched

240 if not inside_parens: 

241 push_new_block() 

242 case "__B__": 

243 # print(f"{current_forms=}, {current_tags=}") 

244 if not inside_parens and current_forms and current_tags: 244 ↛ 245line 244 didn't jump to line 245 because the condition on line 244 was never true

245 push_new_block() 

246 elif not inside_parens: 

247 extend_old_block() 

248 inside_bold = True 

249 case "__/B__": 

250 inside_bold = False 

251 case "__L__": 

252 inside_link = True 

253 case "__/L__": 

254 inside_link = False 

255 case "__I__": 

256 inside_italics = True 

257 case "__/I__": 257 ↛ 259line 257 didn't jump to line 259 because the pattern on line 257 always matched

258 inside_italics = False 

259 case _: 

260 pass 

261 # print(f"{t=}, {blocks=}") 

# Flush whatever is still pending: as a block of its own when both forms and
# tags are pending, otherwise merged into the most recent block.
262 if len(current_forms) > 0 and len(current_tags) > 0: 

263 push_new_block() 

264 else: 

265 extend_old_block() 

266 

267 ret: list[Form] = [] 

268 

# One Form per collected form; the block's tags are de-duplicated and sorted
# before being attached as raw_tags.  Blocks without forms contribute nothing.
269 for forms, raw_tags in blocks: 

270 # print(f"{forms=}, {raw_tags=}") 

271 raw_tags = sorted(set(raw_tags)) 

272 for form in forms: 

273 ret.append( 

274 Form( 

275 form=form, 

276 raw_tags=raw_tags, 

277 source="header", 

278 ) 

279 ) 

280 

281 return ret

139 if inside_italics: 

140 # Italicized words should always be tags 

141 current_tags.append(t) 

142 continue 

143 if is_foreign_script: 

144 current_forms.append(t) 

145 continue 

146 if inside_bold: 

147 # Bolded words should always be forms 

148 # Split off any suffixes inside the same bold node. 

149 suffixes = SUFFIXES_RE.split(t) 

150 for f in suffixes: 

151 f = f.strip() 

152 if not f: 152 ↛ 153line 152 didn't jump to line 153 because the condition on line 152 was never true

153 continue 

154 current_forms.append(f) 

155 # print(f"inside_bold {t=} {current_forms=}") 

156 continue 

157 

158 if inside_link or ( 158 ↛ 166line 158 didn't jump to line 166 because the condition on line 158 was never true

159 not inside_italics and not inside_bold and not seen_bold 

160 ): 

161 # Usually a form, sometimes a tag... 

162 # XXX handle titles with whitespace by doing the splitting 

163 # in N steps: there's one space, split A B C D with 

164 # A B, C D and (A), B C, (D) 

165 

166 if ( 

167 seen_italics and not seen_bold 

168 ): # there has been text in italics before 

169 current_forms.append(t) 

170 continue 

171 words = t.split() 

172 orig_words = (wxr.wtp.title or "").split() 

173 

174 if len(words) < len(orig_words): 

175 # The phrase we're looking at it shorter than the article 

176 # title in words; unlikely that a form like this loses 

177 # words (more likely to add words) so consider this a 

178 # tag: XXX if this turns out to be problematic 

179 current_tags.append(t) 

180 continue 

181 

182 matches = 0 

183 

184 for word in words: 

185 if distw(orig_words, word) < 0.4: 

186 matches += 1 

187 break 

188 

189 if matches > 0: # XXX use better heuristic; problem is that 

190 # percentage-wise, if you add two words to 

191 # one word then the percentage needed is low 

192 # to match it. 

193 current_forms.append(t) 

194 continue 

195 

196 current_tags.append(t) 

197 continue 

198 

199 continue 

200 

201 # Even elements: splitter tokens like commas, parens or formatting 

202 match t: 

203 case "(": 

204 if current_forms and current_tags: 

205 push_new_block() 

206 else: 

207 extend_old_block() 

208 # We don't support nested parens; XXX if there's a problem 

209 # with them 

210 inside_parens = True 

211 case ")": 

212 inside_parens = False 

213 # print(f"{current_forms=}, {current_tags=}, {t=}") 

214 if ( 214 ↛ 222line 214 didn't jump to line 222 because the condition on line 214 was never true

215 not current_forms 

216 and len(current_tags) == 1 

217 and code_to_name(current_tags[0]) != "" 

218 ): 

219 # There are a lot of `(en)` language code tags that we 

220 # don't care about because they're just repeating the 

221 # language code of the word entry itself! 

222 current_tags = [] 

223 continue 

224 if current_forms and current_tags: 

225 push_new_block() 

226 else: 

227 extend_old_block() 

228 case ",": 

229 if not inside_parens: 229 ↛ 92line 229 didn't jump to line 92 because the condition on line 229 was always true

230 if current_forms and current_tags: 

231 push_new_block() 

232 else: 

233 extend_old_block() 

234 case ":": 234 ↛ 235line 234 didn't jump to line 235 because the pattern on line 234 never matched

235 if not inside_parens: 

236 # Do not append to previous. `:` should, logically, 

237 # always point forward 

238 push_new_block() 

239 case ".": 239 ↛ 240line 239 didn't jump to line 240 because the pattern on line 239 never matched

240 if not inside_parens: 

241 push_new_block() 

242 case "__B__": 

243 # print(f"{current_forms=}, {current_tags=}") 

244 if not inside_parens and current_forms and current_tags: 244 ↛ 245line 244 didn't jump to line 245 because the condition on line 244 was never true

245 push_new_block() 

246 elif not inside_parens: 

247 extend_old_block() 

248 inside_bold = True 

249 case "__/B__": 

250 inside_bold = False 

251 case "__L__": 

252 inside_link = True 

253 case "__/L__": 

254 inside_link = False 

255 case "__I__": 

256 inside_italics = True 

257 case "__/I__": 257 ↛ 259line 257 didn't jump to line 259 because the pattern on line 257 always matched

258 inside_italics = False 

259 case _: 

260 pass 

261 # print(f"{t=}, {blocks=}") 

262 if len(current_forms) > 0 and len(current_tags) > 0: 

263 push_new_block() 

264 else: 

265 extend_old_block() 

266 

267 ret: list[Form] = [] 

268 

269 for forms, raw_tags in blocks: 

270 # print(f"{forms=}, {raw_tags=}") 

271 raw_tags = sorted(set(raw_tags)) 

272 for form in forms: 

273 ret.append( 

274 Form( 

275 form=form, 

276 raw_tags=raw_tags, 

277 source="header", 

278 ) 

279 ) 

280 

281 return ret