Coverage for src/wiktextract/extractor/el/head.py: 72%
166 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import re
3from mediawiki_langcodes import code_to_name
5from unicodedata import name as unicode_name
7from wiktextract.extractor.en.form_descriptions import distw
8from wiktextract.wxr_context import WiktextractContext
9from wiktextract.wxr_logging import logger
11from .models import Form, WordEntry
# Splits a head line on formatting markers (`__B__`, `__/B__`, `__I__`,
# `__/I__`, `__L__`, `__/L__`), parentheses, and ", " / ". " / ": ".
# The pattern is one capturing group, so `split()` keeps the delimiters:
# the result alternates text (even indices) and delimiters (odd indices).
BOLD_RE = re.compile(r"(__/?[BIL]__|\(|\)|, |\. |: )")
def parse_head(wxr: WiktextractContext, pos_data: WordEntry, text: str) -> bool:
    """Parse a head line into forms and store them on ``pos_data``.

    ``text`` is split with ``BOLD_RE`` and the resulting token list is
    handed to ``partition_head_forms``.  On success the collected forms
    are assigned to ``pos_data.forms``.

    Returns ``True`` when at least one form was extracted.  Returns
    ``False`` (leaving ``pos_data`` untouched) when the text starts with
    unexpected leading content or when no forms were found.
    """
    split_text = BOLD_RE.split(text)

    if split_text[0] != "":
        # The head normally starts directly with a delimiter, which makes
        # the first split item empty.  *Some* articles add `-` (or "το ")
        # before the template; anything else is treated as unparseable.
        if split_text[0] not in ("-", "το ") or len(split_text) <= 3:
            return False
        # Just throw the prefix into the (probably) bolded text that
        # follows the first delimiter.
        split_text[2] = split_text[0] + split_text[2]
        split_text[0] = ""

    forms: list[Form] = list(partition_head_forms(wxr, split_text))
    if not forms:
        return False

    pos_data.forms = forms
    return True
# Sometimes bolded sections of the head are just smooshed together; from
# what I've seen, it's "form -a -b -c", that is, a form followed by
# suffixes.  Splitting on this pattern keeps each "-suffix" (captured
# group) as its own item.
SUFFIXES_RE = re.compile(r"\s+(-\w+)\b")
58def partition_head_forms(
59 wxr: WiktextractContext, split_text: list[str]
60) -> list[Form]:
61 if len(split_text) < 2: 61 ↛ 62line 61 didn't jump to line 62 because the condition on line 61 was never true
62 wxr.wtp.error(
63 f"Failed to partition head forms; " f"too few items {split_text=}",
64 sortid="head/50/20250303",
65 )
66 return []
68 Forms = list[str]
69 Tags = list[str]
70 blocks: list[tuple[Forms, Tags]] = [([], [])]
71 current_forms: Forms = []
72 current_tags: Tags = []
74 def push_new_block():
75 nonlocal current_forms
76 nonlocal current_tags
77 blocks.append((current_forms, current_tags))
78 current_forms = []
79 current_tags = []
81 def extend_old_block():
82 nonlocal current_forms
83 nonlocal current_tags
84 blocks[-1][0].extend(current_forms)
85 blocks[-1][1].extend(current_tags)
86 current_forms = []
87 current_tags = []
89 seen_italics = False
90 inside_parens = False
91 inside_bold = False
92 inside_link = False
93 inside_italics = False
95 previous_token_was_period = False
96 for i, t in enumerate(split_text):
97 # print(f"{i}: {t=}")
98 t2 = t.strip()
99 if not t2 and t and previous_token_was_period: 99 ↛ 102line 99 didn't jump to line 102 because the condition on line 99 was never true
100 # Whitespace
101 # print("Prev. was dot")
102 previous_token_was_period = False
103 push_new_block()
104 continue
105 t = t2
106 if i % 2 == 0:
107 previous_token_was_period = False
108 if t in ("", "ή"):
109 continue
110 elif t in ("και", "&", ":", ".:"):
111 push_new_block()
112 continue
114 if i % 2 == 0:
115 # Odd elements: text
117 if t == ".":
118 previous_token_was_period = True
119 continue
121 # Check if word is not in greek; if it's not, that's a form.
122 # XXX this might be problematic if there's a stretch of unbolded
123 # text where the bolding has just been forgotten. Fix by
124 # checking each word for greekness?
125 # This doesn't need to check if the language we're processing
126 # is greek or not, because all non-greek words are 'forms'.
127 found_language_code = False
128 is_foreign_script = False
129 for ch in t:
130 if not ch.isalpha():
131 continue
132 if not unicode_name(ch).startswith("GREEK"):
133 if code_to_name(t) != "":
134 found_language_code = True
135 break
136 is_foreign_script = True
137 break
139 if found_language_code:
140 break
142 if inside_italics:
143 # Italicized words should always be tags
144 current_tags.append(t)
145 continue
146 if is_foreign_script:
147 current_forms.append(t)
148 continue
149 if inside_bold: 149 ↛ 160line 149 didn't jump to line 160 because the condition on line 149 was always true
150 # Bolded words should always be forms
151 # Split off any suffixes inside the same bold node.
152 suffixes = SUFFIXES_RE.split(t)
153 for f in suffixes:
154 f = f.strip()
155 if not f: 155 ↛ 156line 155 didn't jump to line 156 because the condition on line 155 was never true
156 continue
157 current_forms.append(f)
158 continue
160 if inside_link or not (
161 inside_link or inside_italics or inside_bold
162 ):
163 # Usually a form, sometimes a tag...
164 # XXX handle titles with whitespace by doing the splitting
165 # in N steps: there's one space, split A B C D with
166 # A B, C D and (A), B C, (D)
168 if seen_italics: # there has been text in italics before
169 current_forms.append(t)
170 continue
171 words = t.split()
172 orig_words = (wxr.wtp.title or "").split()
174 if len(words) < len(orig_words):
175 # The phrase we're looking at it shorter than the article
176 # title in words; unlikely that a form like this loses
177 # words (more likely to add words) so consider this a
178 # tag: XXX if this turns out to be problematic
179 current_tags.append(t)
180 continue
182 matches = 0
184 for word in words:
185 if distw(orig_words, word) < 0.4:
186 matches += 1
187 break
189 if matches > 0: # XXX use better heuristic; problem is that
190 # percentage-wise, if you add two words to
191 # one word then the percentage needed is low
192 # to match it.
193 current_forms.append(t)
194 continue
196 current_tags.append(t)
197 continue
199 continue
201 # Even elements: splitter tokens like commas, parens or formatting
202 match t:
203 case "(":
204 if current_forms and current_tags: 204 ↛ 205line 204 didn't jump to line 205 because the condition on line 204 was never true
205 push_new_block()
206 else:
207 extend_old_block()
208 # We don't support nested parens; XXX if there's a problem
209 # with them
210 inside_parens = True
211 case ")":
212 inside_parens = False
213 # print(f"{current_forms=}, {current_tags=}, {t=}")
214 if ( 214 ↛ 222line 214 didn't jump to line 222 because the condition on line 214 was never true
215 not current_forms
216 and len(current_tags) == 1
217 and code_to_name(current_tags[0]) != ""
218 ):
219 # There are a lot of `(en)` language code tags that we
220 # don't care about because they're just repeating the
221 # language code of the word entry itself!
222 current_tags = []
223 continue
224 if current_forms and current_tags: 224 ↛ 225line 224 didn't jump to line 225 because the condition on line 224 was never true
225 push_new_block()
226 else:
227 extend_old_block()
228 case ",":
229 if not inside_parens: 229 ↛ 96line 229 didn't jump to line 96 because the condition on line 229 was always true
230 if current_forms and current_tags: 230 ↛ 231line 230 didn't jump to line 231 because the condition on line 230 was never true
231 push_new_block()
232 else:
233 extend_old_block()
234 case ":": 234 ↛ 235line 234 didn't jump to line 235 because the pattern on line 234 never matched
235 if not inside_parens:
236 # Do not append to previous. `:` should, logically,
237 # always point forward
238 push_new_block()
239 case ".": 239 ↛ 240line 239 didn't jump to line 240 because the pattern on line 239 never matched
240 if not inside_parens:
241 push_new_block()
242 case "__B__":
243 # print(f"{current_forms=}, {current_tags=}")
244 if current_forms and current_tags: 244 ↛ 245line 244 didn't jump to line 245 because the condition on line 244 was never true
245 push_new_block()
246 else:
247 extend_old_block()
248 inside_bold = True
249 case "__/B__":
250 inside_bold = False
251 case "__L__":
252 inside_link = True
253 case "__/L__":
254 inside_link = False
255 case "__I__":
256 seen_italics = True
257 inside_italics = True
258 case "__/I__": 258 ↛ 260line 258 didn't jump to line 260 because the pattern on line 258 always matched
259 inside_italics = False
260 case _:
261 pass
262 # print(f"{t=}, {blocks=}")
263 if len(current_forms) > 0 and len(current_tags) > 0: 263 ↛ 264line 263 didn't jump to line 264 because the condition on line 263 was never true
264 push_new_block()
265 else:
266 extend_old_block()
268 ret: list[Form] = []
270 for forms, tags in blocks:
271 # print(f"{forms=}, {tags=}")
272 tags = list(set(tags))
273 for form in forms:
274 ret.append(Form(form=form, raw_tags=tags))
276 return ret