Coverage for src / wiktextract / extractor / el / head.py: 77%

162 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from unicodedata import name as unicode_name 

3 

4from mediawiki_langcodes import code_to_name 

5 

6from wiktextract.clean import clean_value 

7from wiktextract.extractor.en.form_descriptions import distw 

8from wiktextract.wxr_context import WiktextractContext 

9 

10from .models import Form 

11 

# Tokenizer for head lines. It matches the `__B__`, `__I__` and `__L__`
# markers together with their `__/X__` closers, opening and closing
# parentheses, and the ", ", ". " and ": " separators.
# Because the whole pattern is one capturing group, `BOLD_RE.split()` keeps
# every matched token in its result: plain text ends up on the even (0-based)
# indices and the matched tokens on the odd ones.
BOLD_RE = re.compile(r"(__/?[BIL]__|\(|\)|, |\. |: )")

13 

14 

def parse_head(wxr: WiktextractContext, text: str) -> list[Form]:
    """Tokenize a head line and hand the tokens to `partition_head_forms`.

    `text` is first passed through `clean_value` and then split with
    `BOLD_RE`, which yields a list alternating between plain text (even
    indices) and separator/markup tokens (odd indices).

    Returns an empty list when the line starts with unexpected leading text;
    otherwise returns whatever `partition_head_forms` produces.
    """
    text = clean_value(wxr, text)
    split_text = BOLD_RE.split(text)
    # print(split_text)

    prefix = split_text[0]
    if prefix != "":
        # The head normally starts with a markup token, so the leading text
        # is empty. Turns out *some* articles add `-` (or "το ") before the
        # template; anything else, or a line too short to carry a following
        # text element, is rejected.
        if prefix not in ("-", "το ") or len(split_text) <= 3:
            return []
        # Just throw the prefix into the (probably) bolded text
        split_text[2] = prefix + split_text[2]
        split_text[0] = ""

    return partition_head_forms(wxr, split_text)

34 

35 

# Sometimes several bolded parts of the head are just smooshed together into
# a single bold node; from what I've seen it looks like "form -a -b -c", that
# is, a form followed by suffixes. The pattern matches whitespace followed by
# a hyphen-prefixed word and captures the suffix, so `SUFFIXES_RE.split()`
# returns the suffixes alongside the leading form (with empty strings between
# adjacent matches).
SUFFIXES_RE = re.compile(r"\s+(-\w+)\b")

39 

40 

41def partition_head_forms( 

42 wxr: WiktextractContext, split_text: list[str] 

43) -> list[Form]: 

44 if len(split_text) < 2: 44 ↛ 45line 44 didn't jump to line 45 because the condition on line 44 was never true

45 wxr.wtp.error( 

46 f"Failed to partition head forms; too few items {split_text=}", 

47 sortid="head/50/20250303", 

48 ) 

49 return [] 

50 

51 Forms = list[str] 

52 Tags = list[str] 

53 blocks: list[tuple[Forms, Tags]] = [([], [])] 

54 current_forms: Forms = [] 

55 current_tags: Tags = [] 

56 

57 def push_new_block() -> None: 

58 nonlocal current_forms 

59 nonlocal current_tags 

60 blocks.append((current_forms, current_tags)) 

61 current_forms = [] 

62 current_tags = [] 

63 

64 def extend_old_block() -> None: 

65 nonlocal current_forms 

66 nonlocal current_tags 

67 blocks[-1][0].extend(current_forms) 

68 blocks[-1][1].extend(current_tags) 

69 current_forms = [] 

70 current_tags = [] 

71 

72 seen_italics = "__I__" in split_text 

73 seen_bold = "__B__" in split_text 

74 inside_parens = False 

75 inside_bold = False 

76 inside_link = False 

77 inside_italics = False 

78 

79 previous_token_was_period = False 

80 for i, t in enumerate(split_text): 

81 # print(f"{i}: {t=}") 

82 # print(f"{current_forms=}, {current_tags=}. Now: {t=}") 

83 t2 = t.strip() 

84 if not t2 and t and previous_token_was_period: 84 ↛ 87line 84 didn't jump to line 87 because the condition on line 84 was never true

85 # Whitespace 

86 # print("Prev. was dot") 

87 previous_token_was_period = False 

88 push_new_block() 

89 continue 

90 t = t2 

91 if i % 2 == 0: 

92 previous_token_was_period = False 

93 if t in ("", "ή"): 

94 continue 

95 elif t in ("και", "&", ":", ".:"): 

96 push_new_block() 

97 continue 

98 

99 if i % 2 == 0: 

100 # Odd elements: text 

101 

102 if t == ".": 

103 previous_token_was_period = True 

104 continue 

105 

106 # Check if word is not in greek; if it's not, that's a form. 

107 # XXX this might be problematic if there's a stretch of unbolded 

108 # text where the bolding has just been forgotten. Fix by 

109 # checking each word for greekness? 

110 # This doesn't need to check if the language we're processing 

111 # is greek or not, because all non-greek words are 'forms'. 

112 found_language_code = False 

113 is_foreign_script = False 

114 for ch in t: 

115 if not ch.isalpha(): 

116 continue 

117 if not unicode_name(ch).startswith("GREEK"): 

118 if code_to_name(t) != "": 

119 found_language_code = True 

120 break 

121 is_foreign_script = True 

122 break 

123 

124 if found_language_code: 

125 break 

126 

127 if inside_italics: 

128 # Italicized words should always be tags 

129 current_tags.append(t) 

130 continue 

131 if is_foreign_script: 

132 current_forms.append(t) 

133 continue 

134 if inside_bold: 

135 # Bolded words should always be forms 

136 # Split off any suffixes inside the same bold node. 

137 suffixes = SUFFIXES_RE.split(t) 

138 for f in suffixes: 

139 f = f.strip() 

140 if not f: 140 ↛ 141line 140 didn't jump to line 141 because the condition on line 140 was never true

141 continue 

142 current_forms.append(f) 

143 # print(f"inside_bold {t=} {current_forms=}") 

144 continue 

145 

146 if inside_link or ( 146 ↛ 154line 146 didn't jump to line 154 because the condition on line 146 was never true

147 not inside_italics and not inside_bold and not seen_bold 

148 ): 

149 # Usually a form, sometimes a tag... 

150 # XXX handle titles with whitespace by doing the splitting 

151 # in N steps: there's one space, split A B C D with 

152 # A B, C D and (A), B C, (D) 

153 

154 if ( 

155 seen_italics and not seen_bold 

156 ): # there has been text in italics before 

157 current_forms.append(t) 

158 continue 

159 words = t.split() 

160 orig_words = (wxr.wtp.title or "").split() 

161 

162 if len(words) < len(orig_words): 

163 # The phrase we're looking at it shorter than the article 

164 # title in words; unlikely that a form like this loses 

165 # words (more likely to add words) so consider this a 

166 # tag: XXX if this turns out to be problematic 

167 current_tags.append(t) 

168 continue 

169 

170 matches = 0 

171 

172 for word in words: 

173 if distw(orig_words, word) < 0.4: 

174 matches += 1 

175 break 

176 

177 if matches > 0: # XXX use better heuristic; problem is that 

178 # percentage-wise, if you add two words to 

179 # one word then the percentage needed is low 

180 # to match it. 

181 current_forms.append(t) 

182 continue 

183 

184 current_tags.append(t) 

185 continue 

186 

187 continue 

188 

189 # Even elements: splitter tokens like commas, parens or formatting 

190 match t: 

191 case "(": 

192 if current_forms and current_tags: 

193 push_new_block() 

194 else: 

195 extend_old_block() 

196 # We don't support nested parens; XXX if there's a problem 

197 # with them 

198 inside_parens = True 

199 case ")": 

200 inside_parens = False 

201 # print(f"{current_forms=}, {current_tags=}, {t=}") 

202 if ( 202 ↛ 210line 202 didn't jump to line 210 because the condition on line 202 was never true

203 not current_forms 

204 and len(current_tags) == 1 

205 and code_to_name(current_tags[0]) != "" 

206 ): 

207 # There are a lot of `(en)` language code tags that we 

208 # don't care about because they're just repeating the 

209 # language code of the word entry itself! 

210 current_tags = [] 

211 continue 

212 if current_forms and current_tags: 

213 push_new_block() 

214 else: 

215 extend_old_block() 

216 case ",": 

217 if not inside_parens: 217 ↛ 80line 217 didn't jump to line 80 because the condition on line 217 was always true

218 if current_forms and current_tags: 

219 push_new_block() 

220 else: 

221 extend_old_block() 

222 case ":": 222 ↛ 223line 222 didn't jump to line 223 because the pattern on line 222 never matched

223 if not inside_parens: 

224 # Do not append to previous. `:` should, logically, 

225 # always point forward 

226 push_new_block() 

227 case ".": 227 ↛ 228line 227 didn't jump to line 228 because the pattern on line 227 never matched

228 if not inside_parens: 

229 push_new_block() 

230 case "__B__": 

231 # print(f"{current_forms=}, {current_tags=}") 

232 if not inside_parens and current_forms and current_tags: 232 ↛ 233line 232 didn't jump to line 233 because the condition on line 232 was never true

233 push_new_block() 

234 elif not inside_parens: 

235 extend_old_block() 

236 inside_bold = True 

237 case "__/B__": 

238 inside_bold = False 

239 case "__L__": 

240 inside_link = True 

241 case "__/L__": 

242 inside_link = False 

243 case "__I__": 

244 inside_italics = True 

245 case "__/I__": 245 ↛ 247line 245 didn't jump to line 247 because the pattern on line 245 always matched

246 inside_italics = False 

247 case _: 

248 pass 

249 # print(f"{t=}, {blocks=}") 

250 if len(current_forms) > 0 and len(current_tags) > 0: 

251 push_new_block() 

252 else: 

253 extend_old_block() 

254 

255 ret: list[Form] = [] 

256 

257 for forms, raw_tags in blocks: 

258 # print(f"{forms=}, {raw_tags=}") 

259 raw_tags = sorted(set(raw_tags)) 

260 for form in forms: 

261 ret.append( 

262 Form( 

263 form=form, 

264 raw_tags=raw_tags, 

265 source="header", 

266 ) 

267 ) 

268 

269 return ret