Coverage for src/wiktextract/extractor/el/head.py: 77%
165 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1import re
2from unicodedata import name as unicode_name
4from mediawiki_langcodes import code_to_name
6from wiktextract.clean import clean_value
7from wiktextract.extractor.en.form_descriptions import distw
8from wiktextract.wxr_context import WiktextractContext
10from .models import Form
# Tokenizer for cleaned head text. Because the whole pattern is one
# capturing group, `BOLD_RE.split()` keeps the separators: the result
# alternates between plain text (even indexes) and separator tokens (odd
# indexes). Separators are the `__B__`/`__I__`/`__L__` markers and their
# `__/X__` closers, the two parentheses, and ", ", ". " and ": ".
BOLD_RE = re.compile(r"(__/?[BIL]__|\(|\)|, |\. |: )")
def parse_head(wxr: WiktextractContext, text: str) -> list[Form]:
    """Extract the forms listed on a head line.

    The text is cleaned with ``clean_value``, tokenized with ``BOLD_RE`` and
    the tokens are handed to ``partition_head_forms``.

    Args:
        wxr: Extraction context, passed through to the helpers.
        text: Head text to parse.

    Returns:
        The forms found by ``partition_head_forms``, or an empty list when
        the line starts with leading text that cannot be handled.
    """
    tokens = BOLD_RE.split(clean_value(wxr, text))
    leading = tokens[0]

    if leading != "":
        # The first token is expected to be empty. Turns out *some* articles
        # add `-` (or "το ") before the template; anything else is rejected.
        if leading not in ("-", "το ") or len(tokens) <= 3:
            return []
        # Just throw the prefix into the (probably) bolded text.
        tokens[2] = leading + tokens[2]
        tokens[0] = ""

    return list(partition_head_forms(wxr, tokens))
# Sometimes bolded sections of the head are just smooshed together; from
# what I've seen, it's "form -a -b -c", that is, suffixes.
# The capturing group makes `SUFFIXES_RE.split()` return every
# whitespace-preceded `-suffix` as an item of its own.
SUFFIXES_RE = re.compile(r"\s+(-\w+)\b")
53def partition_head_forms(
54 wxr: WiktextractContext, split_text: list[str]
55) -> list[Form]:
56 if len(split_text) < 2: 56 ↛ 57line 56 didn't jump to line 57 because the condition on line 56 was never true
57 wxr.wtp.error(
58 f"Failed to partition head forms; too few items {split_text=}",
59 sortid="head/50/20250303",
60 )
61 return []
63 Forms = list[str]
64 Tags = list[str]
65 blocks: list[tuple[Forms, Tags]] = [([], [])]
66 current_forms: Forms = []
67 current_tags: Tags = []
69 def push_new_block() -> None:
70 nonlocal current_forms
71 nonlocal current_tags
72 blocks.append((current_forms, current_tags))
73 current_forms = []
74 current_tags = []
76 def extend_old_block() -> None:
77 nonlocal current_forms
78 nonlocal current_tags
79 blocks[-1][0].extend(current_forms)
80 blocks[-1][1].extend(current_tags)
81 current_forms = []
82 current_tags = []
84 seen_italics = "__I__" in split_text
85 seen_bold = "__B__" in split_text
86 inside_parens = False
87 inside_bold = False
88 inside_link = False
89 inside_italics = False
91 previous_token_was_period = False
92 for i, t in enumerate(split_text):
93 # print(f"{i}: {t=}")
94 # print(f"{current_forms=}, {current_tags=}. Now: {t=}")
95 t2 = t.strip()
96 if not t2 and t and previous_token_was_period: 96 ↛ 99line 96 didn't jump to line 99 because the condition on line 96 was never true
97 # Whitespace
98 # print("Prev. was dot")
99 previous_token_was_period = False
100 push_new_block()
101 continue
102 t = t2
103 if i % 2 == 0:
104 previous_token_was_period = False
105 if t in ("", "ή"):
106 continue
107 elif t in ("και", "&", ":", ".:"):
108 push_new_block()
109 continue
111 if i % 2 == 0:
112 # Odd elements: text
114 if t == ".":
115 previous_token_was_period = True
116 continue
118 # Check if word is not in greek; if it's not, that's a form.
119 # XXX this might be problematic if there's a stretch of unbolded
120 # text where the bolding has just been forgotten. Fix by
121 # checking each word for greekness?
122 # This doesn't need to check if the language we're processing
123 # is greek or not, because all non-greek words are 'forms'.
124 found_language_code = False
125 is_foreign_script = False
126 for ch in t:
127 if not ch.isalpha():
128 continue
129 if not unicode_name(ch).startswith("GREEK"):
130 if code_to_name(t) != "":
131 found_language_code = True
132 break
133 is_foreign_script = True
134 break
136 if found_language_code:
137 break
139 if inside_italics:
140 # Italicized words should always be tags
141 current_tags.append(t)
142 continue
143 if is_foreign_script:
144 current_forms.append(t)
145 continue
146 if inside_bold:
147 # Bolded words should always be forms
148 # Split off any suffixes inside the same bold node.
149 suffixes = SUFFIXES_RE.split(t)
150 for f in suffixes:
151 f = f.strip()
152 if not f: 152 ↛ 153line 152 didn't jump to line 153 because the condition on line 152 was never true
153 continue
154 current_forms.append(f)
155 # print(f"inside_bold {t=} {current_forms=}")
156 continue
158 if inside_link or ( 158 ↛ 166line 158 didn't jump to line 166 because the condition on line 158 was never true
159 not inside_italics and not inside_bold and not seen_bold
160 ):
161 # Usually a form, sometimes a tag...
162 # XXX handle titles with whitespace by doing the splitting
163 # in N steps: there's one space, split A B C D with
164 # A B, C D and (A), B C, (D)
166 if (
167 seen_italics and not seen_bold
168 ): # there has been text in italics before
169 current_forms.append(t)
170 continue
171 words = t.split()
172 orig_words = (wxr.wtp.title or "").split()
174 if len(words) < len(orig_words):
175 # The phrase we're looking at it shorter than the article
176 # title in words; unlikely that a form like this loses
177 # words (more likely to add words) so consider this a
178 # tag: XXX if this turns out to be problematic
179 current_tags.append(t)
180 continue
182 matches = 0
184 for word in words:
185 if distw(orig_words, word) < 0.4:
186 matches += 1
187 break
189 if matches > 0: # XXX use better heuristic; problem is that
190 # percentage-wise, if you add two words to
191 # one word then the percentage needed is low
192 # to match it.
193 current_forms.append(t)
194 continue
196 current_tags.append(t)
197 continue
199 continue
201 # Even elements: splitter tokens like commas, parens or formatting
202 match t:
203 case "(":
204 if current_forms and current_tags:
205 push_new_block()
206 else:
207 extend_old_block()
208 # We don't support nested parens; XXX if there's a problem
209 # with them
210 inside_parens = True
211 case ")":
212 inside_parens = False
213 # print(f"{current_forms=}, {current_tags=}, {t=}")
214 if ( 214 ↛ 222line 214 didn't jump to line 222 because the condition on line 214 was never true
215 not current_forms
216 and len(current_tags) == 1
217 and code_to_name(current_tags[0]) != ""
218 ):
219 # There are a lot of `(en)` language code tags that we
220 # don't care about because they're just repeating the
221 # language code of the word entry itself!
222 current_tags = []
223 continue
224 if current_forms and current_tags:
225 push_new_block()
226 else:
227 extend_old_block()
228 case ",":
229 if not inside_parens: 229 ↛ 92line 229 didn't jump to line 92 because the condition on line 229 was always true
230 if current_forms and current_tags:
231 push_new_block()
232 else:
233 extend_old_block()
234 case ":": 234 ↛ 235line 234 didn't jump to line 235 because the pattern on line 234 never matched
235 if not inside_parens:
236 # Do not append to previous. `:` should, logically,
237 # always point forward
238 push_new_block()
239 case ".": 239 ↛ 240line 239 didn't jump to line 240 because the pattern on line 239 never matched
240 if not inside_parens:
241 push_new_block()
242 case "__B__":
243 # print(f"{current_forms=}, {current_tags=}")
244 if not inside_parens and current_forms and current_tags: 244 ↛ 245line 244 didn't jump to line 245 because the condition on line 244 was never true
245 push_new_block()
246 elif not inside_parens:
247 extend_old_block()
248 inside_bold = True
249 case "__/B__":
250 inside_bold = False
251 case "__L__":
252 inside_link = True
253 case "__/L__":
254 inside_link = False
255 case "__I__":
256 inside_italics = True
257 case "__/I__": 257 ↛ 259line 257 didn't jump to line 259 because the pattern on line 257 always matched
258 inside_italics = False
259 case _:
260 pass
261 # print(f"{t=}, {blocks=}")
262 if len(current_forms) > 0 and len(current_tags) > 0:
263 push_new_block()
264 else:
265 extend_old_block()
267 ret: list[Form] = []
269 for forms, raw_tags in blocks:
270 # print(f"{forms=}, {raw_tags=}")
271 raw_tags = sorted(set(raw_tags))
272 for form in forms:
273 ret.append(
274 Form(
275 form=form,
276 raw_tags=raw_tags,
277 source="header",
278 )
279 )
281 return ret