Coverage for src/wiktextract/extractor/el/head.py: 74%
165 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2from unicodedata import name as unicode_name
4from mediawiki_langcodes import code_to_name
6from wiktextract.extractor.en.form_descriptions import distw
7from wiktextract.wxr_context import WiktextractContext
9from .models import Form, WordEntry
# Delimiters used to tokenize a head line: the formatting markers
# `__B__`/`__/B__`, `__I__`/`__/I__` and `__L__`/`__/L__`, parentheses, and
# a comma, period or colon followed by a space. The capture group makes
# `re.split` keep the delimiters in its result, so text and delimiters
# alternate in the split list.
BOLD_RE = re.compile(r"(__/?[BIL]__|\(|\)|, |\. |: )")
def parse_head(wxr: WiktextractContext, pos_data: WordEntry, text: str) -> bool:
    """Extract the forms of a head line and store them on ``pos_data``.

    ``text`` is split on the delimiters matched by ``BOLD_RE`` and the
    resulting token list is handed to ``partition_head_forms``.

    Returns ``True`` when at least one form was found and assigned to
    ``pos_data.forms``. Returns ``False`` -- leaving ``pos_data`` untouched --
    when the line starts with unexpected text or yields no forms.
    """
    tokens = BOLD_RE.split(text)
    leading = tokens[0]

    if leading != "":
        # The first token is expected to be empty, but *some* articles add a
        # short prefix such as `-` before the template. Only those known
        # prefixes are accepted, and only if there is a later text token to
        # attach them to.
        if leading not in ("-", "το ") or len(tokens) <= 3:
            return False
        # Just throw the prefix into the (probably) bolded text.
        tokens[2] = leading + tokens[2]
        tokens[0] = ""

    extracted: list[Form] = list(partition_head_forms(wxr, tokens))
    if not extracted:
        return False

    pos_data.forms = extracted
    return True
# Sometimes bolded sections of the head are just smooshed together; from
# what I've seen, it's "form -a -b -c", that is, suffixes.
# The pattern matches a `-suffix` preceded by whitespace and captures the
# suffix without that whitespace, so `re.split` returns the suffixes as
# separate items alongside the leading form.
SUFFIXES_RE = re.compile(r"\s+(-\w+)\b")
def partition_head_forms(
    wxr: WiktextractContext, split_text: list[str]
) -> list[Form]:
    """Group the tokens of a split head line into forms and their tags.

    ``split_text`` is treated as alternating between text tokens (even
    indices) and delimiter tokens (odd indices), which is the shape
    ``BOLD_RE.split`` produces in ``parse_head``. The tokens are walked once:
    text tokens are classified as either a form or a tag of the block that is
    currently being collected, and delimiter tokens decide whether the
    collected items are merged into the previous block or start a new one.

    Args:
        wxr: Context object; used here for error reporting
            (``wxr.wtp.error``) and for the page title (``wxr.wtp.title``).
        split_text: The head line split into text and delimiter tokens.

    Returns:
        One ``Form`` per collected form, each carrying the sorted,
        de-duplicated tags of its block as ``raw_tags``. An empty list is
        returned (after reporting an error) when ``split_text`` has fewer
        than two items.
    """
    if len(split_text) < 2:
        wxr.wtp.error(
            f"Failed to partition head forms; " f"too few items {split_text=}",
            sortid="head/50/20250303",
        )
        return []

    Forms = list[str]
    Tags = list[str]
    # Finished blocks; starts with one empty block so that
    # `extend_old_block` always has a previous block to merge into.
    blocks: list[tuple[Forms, Tags]] = [([], [])]
    # Items collected since the last push/extend.
    current_forms: Forms = []
    current_tags: Tags = []

    def push_new_block() -> None:
        # Store the collected forms/tags as a new, separate block and start
        # collecting from scratch.
        nonlocal current_forms
        nonlocal current_tags
        blocks.append((current_forms, current_tags))
        current_forms = []
        current_tags = []

    def extend_old_block() -> None:
        # Merge the collected forms/tags into the most recent block and start
        # collecting from scratch.
        nonlocal current_forms
        nonlocal current_tags
        blocks[-1][0].extend(current_forms)
        blocks[-1][1].extend(current_tags)
        current_forms = []
        current_tags = []

    # Whether the line contains any italic / bold marker at all; used by the
    # heuristics for unformatted text below.
    seen_italics = "__I__" in split_text
    seen_bold = "__B__" in split_text
    # Formatting / nesting state while walking the tokens.
    inside_parens = False
    inside_bold = False
    inside_link = False
    inside_italics = False

    previous_token_was_period = False
    for i, t in enumerate(split_text):
        # print(f"{i}: {t=}")
        t2 = t.strip()
        if not t2 and t and previous_token_was_period:
            # Whitespace-only token while the period flag is still set:
            # treat it as a block boundary.
            # print("Prev. was dot")
            previous_token_was_period = False
            push_new_block()
            continue
        t = t2
        if i % 2 == 0:
            # Text token: reset the period flag, then drop empty tokens and
            # "ή", and let the listed connective / punctuation tokens start
            # a new block.
            previous_token_was_period = False
            if t in ("", "ή"):
                continue
            elif t in ("και", "&", ":", ".:"):
                push_new_block()
                continue

        if i % 2 == 0:
            # Text tokens (1st, 3rd, ... element; even zero-based index).

            if t == ".":
                # A lone period; remembered so that following whitespace can
                # start a new block (see the check at the top of the loop).
                previous_token_was_period = True
                continue

            # Check if word is not in greek; if it's not, that's a form.
            # XXX this might be problematic if there's a stretch of unbolded
            # text where the bolding has just been forgotten. Fix by
            # checking each word for greekness?
            # This doesn't need to check if the language we're processing
            # is greek or not, because all non-greek words are 'forms'.
            found_language_code = False
            is_foreign_script = False
            for ch in t:
                if not ch.isalpha():
                    continue
                if not unicode_name(ch).startswith("GREEK"):
                    # First non-Greek letter decides: either the whole token
                    # is a known language code, or it is foreign-script text.
                    if code_to_name(t) != "":
                        found_language_code = True
                        break
                    is_foreign_script = True
                    break

            if found_language_code:
                # Stops processing ALL remaining tokens, not just this one.
                # NOTE(review): because of this early exit a non-Greek
                # language code never reaches `current_tags`, so the
                # language-code check under `case ")"` below looks
                # effectively unreachable for such codes -- confirm whether
                # `break` (rather than `continue`) is intended.
                break

            if inside_italics:
                # Italicized words should always be tags
                current_tags.append(t)
                continue
            if is_foreign_script:
                current_forms.append(t)
                continue
            if inside_bold:
                # Bolded words should always be forms
                # Split off any suffixes inside the same bold node.
                suffixes = SUFFIXES_RE.split(t)
                for f in suffixes:
                    f = f.strip()
                    if not f:
                        continue
                    current_forms.append(f)
                continue

            if inside_link or (
                not inside_italics and not inside_bold and not seen_bold
            ):
                # Usually a form, sometimes a tag...
                # XXX handle titles with whitespace by doing the splitting
                # in N steps: there's one space, split A B C D with
                # A B, C D and (A), B C, (D)

                if (
                    seen_italics and not seen_bold
                ):  # there has been text in italics before
                    current_forms.append(t)
                    continue
                words = t.split()
                orig_words = (wxr.wtp.title or "").split()

                if len(words) < len(orig_words):
                    # The phrase we're looking at is shorter than the article
                    # title in words; unlikely that a form like this loses
                    # words (more likely to add words) so consider this a
                    # tag: XXX if this turns out to be problematic
                    current_tags.append(t)
                    continue

                # Count as a form if any single word is close enough to the
                # title words; `distw` is presumably a distance measure where
                # smaller means more similar -- verify against its definition.
                matches = 0

                for word in words:
                    if distw(orig_words, word) < 0.4:
                        matches += 1
                        break

                if matches > 0:  # XXX use better heuristic; problem is that
                    # percentage-wise, if you add two words to
                    # one word then the percentage needed is low
                    # to match it.
                    current_forms.append(t)
                    continue

                current_tags.append(t)
                continue

            # Unformatted text in a line that has bold elsewhere is ignored.
            continue

        # Delimiter tokens (2nd, 4th, ... element; odd zero-based index):
        # commas, parens, punctuation or formatting markers.
        match t:
            case "(":
                if current_forms and current_tags:
                    push_new_block()
                else:
                    extend_old_block()
                # We don't support nested parens; XXX if there's a problem
                # with them
                inside_parens = True
            case ")":
                inside_parens = False
                # print(f"{current_forms=}, {current_tags=}, {t=}")
                if (
                    not current_forms
                    and len(current_tags) == 1
                    and code_to_name(current_tags[0]) != ""
                ):
                    # There are a lot of `(en)` language code tags that we
                    # don't care about because they're just repeating the
                    # language code of the word entry itself!
                    current_tags = []
                    continue
                if current_forms and current_tags:
                    push_new_block()
                else:
                    extend_old_block()
            case ",":
                # Commas inside parentheses do not end the current group.
                if not inside_parens:
                    if current_forms and current_tags:
                        push_new_block()
                    else:
                        extend_old_block()
            case ":":
                if not inside_parens:
                    # Do not append to previous. `:` should, logically,
                    # always point forward
                    push_new_block()
            case ".":
                if not inside_parens:
                    push_new_block()
            case "__B__":
                # print(f"{current_forms=}, {current_tags=}")
                # Flush whatever was collected before the bold section.
                if current_forms and current_tags:
                    push_new_block()
                else:
                    extend_old_block()
                inside_bold = True
            case "__/B__":
                inside_bold = False
            case "__L__":
                inside_link = True
            case "__/L__":
                inside_link = False
            case "__I__":
                inside_italics = True
            case "__/I__":
                inside_italics = False
            case _:
                pass
        # print(f"{t=}, {blocks=}")
    # Flush whatever is still pending after the last token.
    if len(current_forms) > 0 and len(current_tags) > 0:
        push_new_block()
    else:
        extend_old_block()

    ret: list[Form] = []

    for forms, tags in blocks:
        # print(f"{forms=}, {tags=}")
        # Every form of a block gets the same de-duplicated, sorted tags.
        tags = sorted(set(tags))

        for form in forms:
            ret.append(Form(form=form, raw_tags=tags))

    return ret