Coverage for src / wiktextract / extractor / el / head.py: 77%
162 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
2from unicodedata import name as unicode_name
4from mediawiki_langcodes import code_to_name
6from wiktextract.clean import clean_value
7from wiktextract.extractor.en.form_descriptions import distw
8from wiktextract.wxr_context import WiktextractContext
10from .models import Form
# Separator pattern used to tokenise a head line: the formatting markers
# `__B__`/`__/B__`, `__I__`/`__/I__`, `__L__`/`__/L__`, parentheses, and the
# punctuation sequences ", ", ". " and ": ".
# The whole pattern is one capturing group, so `BOLD_RE.split()` keeps the
# separators in its result (text at even indices, separators at odd ones).
12BOLD_RE = re.compile(r"(__/?[BIL]__|\(|\)|, |\. |: )")
def parse_head(wxr: WiktextractContext, text: str) -> list[Form]:
    """Tokenise a head line and convert it into a list of `Form` objects.

    The text is run through `clean_value` and then split with `BOLD_RE`.
    Because that pattern is a single capturing group, the separators are
    kept in the result: text chunks sit at even indices, separators at odd
    ones.

    The first text chunk is expected to be empty.  The only tolerated
    exceptions are the prefixes "-" and "το ": when the split has more than
    three items, the prefix is glued onto the text chunk at index 2.  Any
    other leading text, or a split that is too short, yields an empty list.
    """
    tokens = BOLD_RE.split(clean_value(wxr, text))
    # print(tokens)

    prefix = tokens[0]
    if prefix != "":
        # *Some* articles add `-` (or an article) before the template.
        if prefix not in ("-", "το ") or len(tokens) <= 3:
            return []
        # Just throw the prefix into the (probably) bolded text.
        tokens[2] = prefix + tokens[2]
        tokens[0] = ""

    return partition_head_forms(wxr, tokens)
36# Sometimes bolded sections of the head are just smooshed together; from
37# what I've seen, it's "form -a -b -c", that is, suffixes.
# The `-suffix` part is a capturing group, so splitting with this pattern
# keeps every suffix as its own item while the preceding whitespace is dropped.
38SUFFIXES_RE = re.compile(r"\s+(-\w+)\b")
# partition_head_forms: walk the token list of a head line and group its text
# chunks into blocks of (forms, tags).
#
# `split_text` is a list in which text chunks and separator tokens alternate
# (text at even 0-based indices, separators at odd ones); presumably the
# output of `BOLD_RE.split()` as produced by `parse_head` -- confirm for other
# callers.
#
# Returns one `Form` per collected form, with `raw_tags` set to the sorted,
# de-duplicated tags of the block the form belongs to and `source="header"`.
# If `split_text` has fewer than two items, an error is reported through
# `wxr.wtp.error` and an empty list is returned.
#
# NOTE(review): this listing has lost its indentation, so the nesting of some
# statements (notably the `if t in ("", "ή")` / `elif` pair following the
# first `if i % 2 == 0:`) cannot be read off the text -- check the real file.
41def partition_head_forms(
42 wxr: WiktextractContext, split_text: list[str]
43) -> list[Form]:
44 if len(split_text) < 2: 44 ↛ 45line 44 didn't jump to line 45 because the condition on line 44 was never true
45 wxr.wtp.error(
46 f"Failed to partition head forms; too few items {split_text=}",
47 sortid="head/50/20250303",
48 )
49 return []
# Local type aliases; each block pairs a list of forms with a list of tags.
# `blocks` starts with one empty block so `extend_old_block` always has a
# target.
51 Forms = list[str]
52 Tags = list[str]
53 blocks: list[tuple[Forms, Tags]] = [([], [])]
# Forms and tags collected since the last block boundary.
54 current_forms: Forms = []
55 current_tags: Tags = []
# Append the pending forms/tags as a new block, then start fresh lists.
57 def push_new_block() -> None:
58 nonlocal current_forms
59 nonlocal current_tags
60 blocks.append((current_forms, current_tags))
61 current_forms = []
62 current_tags = []
# Merge the pending forms/tags into the most recent block, then start fresh
# lists.
64 def extend_old_block() -> None:
65 nonlocal current_forms
66 nonlocal current_tags
67 blocks[-1][0].extend(current_forms)
68 blocks[-1][1].extend(current_tags)
69 current_forms = []
70 current_tags = []
# Whether an italics / bold opening marker occurs anywhere in the token list.
72 seen_italics = "__I__" in split_text
73 seen_bold = "__B__" in split_text
# Parser state toggled by the separator tokens handled in the `match` below.
74 inside_parens = False
75 inside_bold = False
76 inside_link = False
77 inside_italics = False
79 previous_token_was_period = False
80 for i, t in enumerate(split_text):
81 # print(f"{i}: {t=}")
82 # print(f"{current_forms=}, {current_tags=}. Now: {t=}")
83 t2 = t.strip()
# A whitespace-only token that follows a lone "." text token closes the
# current block.
84 if not t2 and t and previous_token_was_period: 84 ↛ 87line 84 didn't jump to line 87 because the condition on line 84 was never true
85 # Whitespace
86 # print("Prev. was dot")
87 previous_token_was_period = False
88 push_new_block()
89 continue
90 t = t2
91 if i % 2 == 0:
92 previous_token_was_period = False
# Empty chunks and "ή" are skipped; "και", "&", ":" and ".:" start a new
# block.
93 if t in ("", "ή"):
94 continue
95 elif t in ("και", "&", ":", ".:"):
96 push_new_block()
97 continue
99 if i % 2 == 0:
100 # Odd elements (counting from 1; even 0-based index): text
102 if t == ".":
103 previous_token_was_period = True
104 continue
106 # Check if word is not in greek; if it's not, that's a form.
107 # XXX this might be problematic if there's a stretch of unbolded
108 # text where the bolding has just been forgotten. Fix by
109 # checking each word for greekness?
110 # This doesn't need to check if the language we're processing
111 # is greek or not, because all non-greek words are 'forms'.
112 found_language_code = False
113 is_foreign_script = False
# Only the first alphabetic character whose Unicode name does not start with
# "GREEK" matters: if `code_to_name` returns a non-empty name for the whole
# token it is flagged as a language code, otherwise as foreign script.
114 for ch in t:
115 if not ch.isalpha():
116 continue
117 if not unicode_name(ch).startswith("GREEK"):
118 if code_to_name(t) != "":
119 found_language_code = True
120 break
121 is_foreign_script = True
122 break
# NOTE(review): this `break` sits after the character loop, so it appears to
# leave the enclosing token loop, i.e. tokens after a language code are not
# processed. Confirm this is intended rather than `continue`.
124 if found_language_code:
125 break
127 if inside_italics:
128 # Italicized words should always be tags
129 current_tags.append(t)
130 continue
131 if is_foreign_script:
132 current_forms.append(t)
133 continue
134 if inside_bold:
135 # Bolded words should always be forms
136 # Split off any suffixes inside the same bold node.
137 suffixes = SUFFIXES_RE.split(t)
138 for f in suffixes:
139 f = f.strip()
140 if not f: 140 ↛ 141line 140 didn't jump to line 141 because the condition on line 140 was never true
141 continue
142 current_forms.append(f)
143 # print(f"inside_bold {t=} {current_forms=}")
144 continue
# At this point the token is neither italic nor bold; it is only classified
# when it is inside a link or when the head contains no bold marker at all.
146 if inside_link or ( 146 ↛ 154line 146 didn't jump to line 154 because the condition on line 146 was never true
147 not inside_italics and not inside_bold and not seen_bold
148 ):
149 # Usually a form, sometimes a tag...
150 # XXX handle titles with whitespace by doing the splitting
151 # in N steps: there's one space, split A B C D with
152 # A B, C D and (A), B C, (D)
154 if (
155 seen_italics and not seen_bold
156 ): # there has been text in italics before
157 current_forms.append(t)
158 continue
159 words = t.split()
160 orig_words = (wxr.wtp.title or "").split()
162 if len(words) < len(orig_words):
163 # The phrase we're looking at is shorter than the article
164 # title in words; unlikely that a form like this loses
165 # words (more likely to add words) so consider this a
166 # tag: XXX if this turns out to be problematic
167 current_tags.append(t)
168 continue
170 matches = 0
# One word with a `distw` score below 0.4 against the title words is enough to
# treat the phrase as a form.
172 for word in words:
173 if distw(orig_words, word) < 0.4:
174 matches += 1
175 break
177 if matches > 0: # XXX use better heuristic; problem is that
178 # percentage-wise, if you add two words to
179 # one word then the percentage needed is low
180 # to match it.
181 current_forms.append(t)
182 continue
184 current_tags.append(t)
185 continue
187 continue
189 # Even elements (counting from 1; odd 0-based index): splitter tokens
# like commas, parens or formatting
# Block-boundary rule shared by several cases below: when both forms and tags
# are pending they become a new block, otherwise the pending items are merged
# into the previous block.
190 match t:
191 case "(":
192 if current_forms and current_tags:
193 push_new_block()
194 else:
195 extend_old_block()
196 # We don't support nested parens; XXX if there's a problem
197 # with them
198 inside_parens = True
199 case ")":
200 inside_parens = False
201 # print(f"{current_forms=}, {current_tags=}, {t=}")
202 if ( 202 ↛ 210line 202 didn't jump to line 210 because the condition on line 202 was never true
203 not current_forms
204 and len(current_tags) == 1
205 and code_to_name(current_tags[0]) != ""
206 ):
207 # There are a lot of `(en)` language code tags that we
208 # don't care about because they're just repeating the
209 # language code of the word entry itself!
210 current_tags = []
211 continue
212 if current_forms and current_tags:
213 push_new_block()
214 else:
215 extend_old_block()
216 case ",":
217 if not inside_parens: 217 ↛ 80line 217 didn't jump to line 80 because the condition on line 217 was always true
218 if current_forms and current_tags:
219 push_new_block()
220 else:
221 extend_old_block()
222 case ":": 222 ↛ 223line 222 didn't jump to line 223 because the pattern on line 222 never matched
223 if not inside_parens:
224 # Do not append to previous. `:` should, logically,
225 # always point forward
226 push_new_block()
227 case ".": 227 ↛ 228line 227 didn't jump to line 228 because the pattern on line 227 never matched
228 if not inside_parens:
229 push_new_block()
230 case "__B__":
231 # print(f"{current_forms=}, {current_tags=}")
232 if not inside_parens and current_forms and current_tags: 232 ↛ 233line 232 didn't jump to line 233 because the condition on line 232 was never true
233 push_new_block()
234 elif not inside_parens:
235 extend_old_block()
236 inside_bold = True
237 case "__/B__":
238 inside_bold = False
239 case "__L__":
240 inside_link = True
241 case "__/L__":
242 inside_link = False
243 case "__I__":
244 inside_italics = True
245 case "__/I__": 245 ↛ 247line 245 didn't jump to line 247 because the pattern on line 245 always matched
246 inside_italics = False
247 case _:
248 pass
249 # print(f"{t=}, {blocks=}")
# Flush whatever is still pending, using the same boundary rule as above.
250 if len(current_forms) > 0 and len(current_tags) > 0:
251 push_new_block()
252 else:
253 extend_old_block()
255 ret: list[Form] = []
# Emit one Form per form; every form of a block shares that block's tags,
# de-duplicated and sorted.
257 for forms, raw_tags in blocks:
258 # print(f"{forms=}, {raw_tags=}")
259 raw_tags = sorted(set(raw_tags))
260 for form in forms:
261 ret.append(
262 Form(
263 form=form,
264 raw_tags=raw_tags,
265 source="header",
266 )
267 )
269 return ret