Coverage for src/wiktextract/extractor/el/head.py: 72%

1import re

3from mediawiki_langcodes import code_to_name

5from unicodedata import name as unicode_name

7from wiktextract.extractor.en.form_descriptions import distw

8from wiktextract.wxr_context import WiktextractContext

9from wiktextract.wxr_logging import logger

11from .models import Form, WordEntry

# Delimiters used to tokenize a head line: the `__B__`/`__I__`/`__L__`
# markers and their closing `__/X__` counterparts, parentheses, and the
# punctuation sequences ", ", ". " and ": ".
# The whole pattern is one capture group, so `BOLD_RE.split()` keeps the
# delimiters: text lands on even indices and delimiters on odd indices.
BOLD_RE = re.compile(r"(__/?[BIL]__|\(|\)|, |\. |: )")

def parse_head(wxr: WiktextractContext, pos_data: WordEntry, text: str) -> bool:
    """Extract the forms found in a head line and store them on `pos_data`.

    `text` is tokenized with `BOLD_RE` and the tokens are handed to
    `partition_head_forms`.

    Returns True when at least one form was found, in which case
    `pos_data.forms` is replaced with the result. Returns False, leaving
    `pos_data` untouched, when the line starts with unexpected leading
    text or when no forms could be extracted.
    """
    pieces = BOLD_RE.split(text)
    lead = pieces[0]

    if lead != "":
        # The line is expected to start directly with a delimiter, which
        # makes the first piece empty. Turns out *some* articles add `-`
        # (or `το `) before the template; anything else is rejected.
        if lead not in ("-", "το ") or len(pieces) <= 3:
            return False
        # Just throw the prefix into the (probably) bolded text.
        pieces[2] = lead + pieces[2]
        pieces[0] = ""

    forms: list[Form] = list(partition_head_forms(wxr, pieces))
    if not forms:
        return False

    pos_data.forms = forms
    return True

# Sometimes bolded sections of the head are just smooshed together; from
# what I've seen, it's "form -a -b -c", that is, a form followed by
# suffixes.
# The `-suffix` part is a capture group, so `SUFFIXES_RE.split()` returns
# each suffix as its own item instead of dropping it.
SUFFIXES_RE = re.compile(r"\s+(-\w+)\b")

58def partition_head_forms(

59 wxr: WiktextractContext, split_text: list[str]

60) -> list[Form]:

61 if len(split_text) < 2: 61 ↛ 62line 61 didn't jump to line 62 because the condition on line 61 was never true

62 wxr.wtp.error(

63 f"Failed to partition head forms; " f"too few items {split_text=}",

64 sortid="head/50/20250303",

65 )

66 return []

68 Forms = list[str]

69 Tags = list[str]

70 blocks: list[tuple[Forms, Tags]] = [([], [])]

71 current_forms: Forms = []

72 current_tags: Tags = []

74 def push_new_block():

75 nonlocal current_forms

76 nonlocal current_tags

77 blocks.append((current_forms, current_tags))

78 current_forms = []

79 current_tags = []

81 def extend_old_block():

82 nonlocal current_forms

83 nonlocal current_tags

84 blocks[-1][0].extend(current_forms)

85 blocks[-1][1].extend(current_tags)

86 current_forms = []

87 current_tags = []

89 seen_italics = False

90 inside_parens = False

91 inside_bold = False

92 inside_link = False

93 inside_italics = False

95 previous_token_was_period = False

96 for i, t in enumerate(split_text):

97 # print(f"{i}: {t=}")

98 t2 = t.strip()

99 if not t2 and t and previous_token_was_period: 99 ↛ 102line 99 didn't jump to line 102 because the condition on line 99 was never true

100 # Whitespace

101 # print("Prev. was dot")

102 previous_token_was_period = False

103 push_new_block()

104 continue

105 t = t2

106 if i % 2 == 0:

107 previous_token_was_period = False

108 if t in ("", "ή"):

109 continue

110 elif t in ("και", "&", ":", ".:"):

111 push_new_block()

112 continue

113

114 if i % 2 == 0:

115 # Odd elements: text

116

117 if t == ".":

118 previous_token_was_period = True

119 continue

120

121 # Check if word is not in greek; if it's not, that's a form.

122 # XXX this might be problematic if there's a stretch of unbolded

123 # text where the bolding has just been forgotten. Fix by

124 # checking each word for greekness?

125 # This doesn't need to check if the language we're processing

126 # is greek or not, because all non-greek words are 'forms'.

127 found_language_code = False

128 is_foreign_script = False

129 for ch in t:

130 if not ch.isalpha():

131 continue

132 if not unicode_name(ch).startswith("GREEK"):

133 if code_to_name(t) != "":

134 found_language_code = True

135 break

136 is_foreign_script = True

137 break

138

139 if found_language_code:

140 break

141

142 if inside_italics:

143 # Italicized words should always be tags

144 current_tags.append(t)

145 continue

146 if is_foreign_script:

147 current_forms.append(t)

148 continue

149 if inside_bold: 149 ↛ 160line 149 didn't jump to line 160 because the condition on line 149 was always true

150 # Bolded words should always be forms

151 # Split off any suffixes inside the same bold node.

152 suffixes = SUFFIXES_RE.split(t)

153 for f in suffixes:

154 f = f.strip()

155 if not f: 155 ↛ 156line 155 didn't jump to line 156 because the condition on line 155 was never true

156 continue

157 current_forms.append(f)

158 continue

159

160 if inside_link or not (

161 inside_link or inside_italics or inside_bold

162 ):

163 # Usually a form, sometimes a tag...

164 # XXX handle titles with whitespace by doing the splitting

165 # in N steps: there's one space, split A B C D with

166 # A B, C D and (A), B C, (D)

167

168 if seen_italics: # there has been text in italics before

169 current_forms.append(t)

170 continue

171 words = t.split()

172 orig_words = (wxr.wtp.title or "").split()

173

174 if len(words) < len(orig_words):

175 # The phrase we're looking at it shorter than the article

176 # title in words; unlikely that a form like this loses

177 # words (more likely to add words) so consider this a

178 # tag: XXX if this turns out to be problematic

179 current_tags.append(t)

180 continue

181

182 matches = 0

183

184 for word in words:

185 if distw(orig_words, word) < 0.4:

186 matches += 1

187 break

188

189 if matches > 0: # XXX use better heuristic; problem is that

190 # percentage-wise, if you add two words to

191 # one word then the percentage needed is low

192 # to match it.

193 current_forms.append(t)

194 continue

195

196 current_tags.append(t)

197 continue

198

199 continue

200

201 # Even elements: splitter tokens like commas, parens or formatting

202 match t:

203 case "(":

204 if current_forms and current_tags: 204 ↛ 205line 204 didn't jump to line 205 because the condition on line 204 was never true

205 push_new_block()

206 else:

207 extend_old_block()

208 # We don't support nested parens; XXX if there's a problem

209 # with them

210 inside_parens = True

211 case ")":

212 inside_parens = False

213 # print(f"{current_forms=}, {current_tags=}, {t=}")

214 if ( 214 ↛ 222line 214 didn't jump to line 222 because the condition on line 214 was never true

215 not current_forms

216 and len(current_tags) == 1

217 and code_to_name(current_tags[0]) != ""

218 ):

219 # There are a lot of `(en)` language code tags that we

220 # don't care about because they're just repeating the

221 # language code of the word entry itself!

222 current_tags = []

223 continue

224 if current_forms and current_tags: 224 ↛ 225line 224 didn't jump to line 225 because the condition on line 224 was never true

225 push_new_block()

226 else:

227 extend_old_block()

228 case ",":

229 if not inside_parens: 229 ↛ 96line 229 didn't jump to line 96 because the condition on line 229 was always true

230 if current_forms and current_tags: 230 ↛ 231line 230 didn't jump to line 231 because the condition on line 230 was never true

231 push_new_block()

232 else:

233 extend_old_block()

234 case ":": 234 ↛ 235line 234 didn't jump to line 235 because the pattern on line 234 never matched

235 if not inside_parens:

236 # Do not append to previous. `:` should, logically,

237 # always point forward

238 push_new_block()

239 case ".": 239 ↛ 240line 239 didn't jump to line 240 because the pattern on line 239 never matched

240 if not inside_parens:

241 push_new_block()

242 case "__B__":

243 # print(f"{current_forms=}, {current_tags=}")

244 if current_forms and current_tags: 244 ↛ 245line 244 didn't jump to line 245 because the condition on line 244 was never true

245 push_new_block()

246 else:

247 extend_old_block()

248 inside_bold = True

249 case "__/B__":

250 inside_bold = False

251 case "__L__":

252 inside_link = True

253 case "__/L__":

254 inside_link = False

255 case "__I__":

256 seen_italics = True

257 inside_italics = True

258 case "__/I__": 258 ↛ 260line 258 didn't jump to line 260 because the pattern on line 258 always matched

259 inside_italics = False

260 case _:

261 pass

262 # print(f"{t=}, {blocks=}")

263 if len(current_forms) > 0 and len(current_tags) > 0: 263 ↛ 264line 263 didn't jump to line 264 because the condition on line 263 was never true

264 push_new_block()

265 else:

266 extend_old_block()

267

268 ret: list[Form] = []

269

270 for forms, tags in blocks:

271 # print(f"{forms=}, {tags=}")

272 tags = list(set(tags))

273 for form in forms:

274 ret.append(Form(form=form, raw_tags=tags))

275

276 return ret