Coverage for src/wiktextract/extractor/el/parse_utils.py: 81%
94 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1import re
2import unicodedata
3from typing import Generator, TypeAlias
5from wikitextprocessor import LevelNode, WikiNode
7from wiktextract.page import clean_node
8from wiktextract.wxr_context import WiktextractContext
10from .models import Form, WordEntry
11from .section_titles import (
12 POS_HEADINGS,
13 POS_HEADINGS_RE,
14 SUBSECTION_HEADINGS,
15 SUBSECTIONS_RE,
16 Heading,
17 POSName,
18 Tags,
19)
20from .text_utils import normalized_int
# Ignorable templates that generate panels to the side, like
# Template:Wikipedia, or other meta-info like Template:see.
# Called 'panel templates' because they often generate panels.
# NOTE: set literal instead of set([...]) — same contents, idiomatic form
# (ruff C405).
PANEL_TEMPLATES: set[str] = {
    "interwiktionary",
    "stub",
    "wik",
    "wikipedia",
    "Wikipedia",
    "wikispecies",
    "wikiquote",
    "Wikiquote",
    "improve",
}

# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
# PANEL_PREFIXES: set[str] = set()

# Additional templates to be expanded in the pre-expand phase
# XXX nothing here yet, add as needed if some template turns out to be
# problematic when unexpanded.
ADDITIONAL_EXPAND_TEMPLATES: set[str] = set()

# Names of templates used in etymology sections whose parameters we want
# to store in `etymology_templates`.
ETYMOLOGY_TEMPLATES: set[str] = set()
# Language codes treated as varieties/stages of Greek by this extractor.
# NOTE: set literal instead of set((...)) — same contents, idiomatic form
# (ruff C405); annotation added for consistency with PANEL_TEMPLATES.
GREEK_LANGCODES: set[str] = {
    "el",
    "grc",
    "el2",
    "el-crt",
    "el-cyp",
    "gkm",
    "gkm-cyp",
    "gkm-crt",
    "gmy",
    "gmy2",
    "grc-dor",
    "grc-ion",
    "grc-koi",
    "grk",
    "grk-ita",
    "grk-pro",
    "kath",
    "pnt",
    "pregrc",
    "tsd",
    "xme-old",
    "xmk",
}
# Type aliases documenting the tuples passed between the section-parsing
# functions below.
Title: TypeAlias = str  # Cleaned heading text as it appeared on the page
SectionName: TypeAlias = str  # The keys of SUBSECTION_HEADINGS
# One entry per POS section found on a page.
POSReturns: TypeAlias = list[
    tuple[POSName, Title, Tags, int, WikiNode, WordEntry]
]
def find_sections(
    wxr: WiktextractContext,
    nodes: list[WikiNode] | list[LevelNode],
) -> Generator[
    tuple[Heading, POSName | SectionName, Title, Tags, int, WikiNode],
    None,
    None,
]:
    """Yield parsed heading data for each section node.

    In practice, only called when we expect heading_type to be either
    Heading.POS or Heading.Pron.

    Heading.POS guarantees that pos_or_section is a POSName.
    Heading.Pron guarantees that pos_or_section is a SectionName, and, looking
    at SUBSECTION_HEADINGS, either: "pronunciation" or "προφορά"
    """
    for section_node in nodes:
        # Normalize the heading text before matching it against the tables.
        title = (
            clean_node(wxr, None, section_node.largs[0]).lower().strip()
        )

        section_type, name_or_section, section_tags, number, _ok = (
            parse_lower_heading(wxr, title)
        )

        if number > 0:
            # Numbered sub-sub-sections are unusual enough to flag.
            wxr.wtp.wiki_notice(
                f"Sub-sub-section is numbered: {title}, num={number}",
                sortid="page/find_pos_sections_1",
            )
        yield (
            section_type,
            name_or_section,
            title,
            section_tags,
            number,
            section_node,
        )
def parse_lower_heading(
    wxr: WiktextractContext, heading: str
) -> tuple[Heading, POSName | SectionName, Tags, int, bool]:
    """Determine if a heading is for a part of speech or other subsection.

    Returns heading type enum, POS name or string data, list of tags, a
    trailing section number (-1 when absent) and a success bool.
    """
    # Try POS headings first; only fall through to generic subsections
    # when the POS parse did not succeed.
    pos_match = POS_HEADINGS_RE.match(heading)
    if pos_match is not None:
        pos_result = parse_pos_heading(wxr, heading, pos_match)
        if pos_result[-1]:
            return pos_result

    section_match = SUBSECTIONS_RE.match(heading)
    if section_match is not None:
        section_result = parse_section_heading(wxr, heading, section_match)
        if section_result[-1]:
            return section_result

    return Heading.Err, "", [], -1, False
def parse_pos_heading(
    wxr: WiktextractContext, heading: str, m: re.Match[str]
) -> tuple[Heading, POSName, Tags, int, bool]:
    """Extract POS name, tags and trailing number from a matched heading."""
    pos_key = m.group(1)
    trailing = m.group(2)

    # A purely numeric trailer (e.g. "ουσιαστικό 2") becomes the section
    # number; anything else leaves it at -1.
    number = -1
    if trailing:
        stripped = trailing.strip()
        if stripped.isdigit():
            number = normalized_int(stripped)

    pos_entry = POS_HEADINGS[pos_key]
    return (
        Heading.POS,
        pos_entry["pos"],
        pos_entry.get("tags", []),
        number,
        True,
    )
def parse_section_heading(
    wxr: WiktextractContext, heading: str, m: re.Match[str]
) -> tuple[Heading, SectionName, Tags, int, bool]:
    """Extract subsection name, tags and trailing number from a heading."""
    section_key = m.group(1)
    trailing = m.group(2)

    # Mirror parse_pos_heading: a purely numeric trailer becomes the
    # section number; anything else leaves it at -1.
    number = -1
    if trailing:
        stripped = trailing.strip()
        if stripped.isdigit():
            number = normalized_int(stripped)

    section_entry = SUBSECTION_HEADINGS[section_key]
    return (
        section_entry["type"],
        section_key,
        section_entry.get("tags", []),
        number,
        True,
    )
# https://stackoverflow.com/a/518232
def strip_accents(accented: str) -> str:
    """Return `accented` with all nonspacing marks (category Mn) removed.

    Decomposes to NFD first so precomposed accented letters split into a
    base letter plus a combining mark that can be filtered out.
    """
    decomposed = unicodedata.normalize("NFD", accented)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)
def remove_duplicate_forms(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Check for identical `forms` and remove duplicates.

    A form is kept only when no LATER form has the same `form`, `tags`
    and `raw_tags`, so the last occurrence of each duplicate survives.
    Returns the original list object unchanged when nothing was removed.
    """
    if not forms:
        return []

    deduped: list[Form] = []
    for pos, candidate in enumerate(forms):
        has_later_twin = any(
            candidate.form == later.form
            and candidate.tags == later.tags
            and candidate.raw_tags == later.raw_tags
            for later in forms[pos + 1 :]
        )
        if not has_later_twin:
            deduped.append(candidate)

    if len(deduped) < len(forms):
        # wxr.wtp.debug("Found duplicate forms", sortid="simple/pos/32")
        return deduped
    return forms
def get_stem(word: str) -> str:
    """Get the stem from a Greek adjective or participle.

    Cuts the final consonant cluster and the vowel (or vowel digraph)
    before it, returning the leading part of the ORIGINAL word with its
    accents intact.
    """
    vowels = "αειουηω"
    digraphs = ("αι", "ει", "οι", "ου", "υι")
    # Accent-insensitive copy used only for classification: NFD splits
    # precomposed letters into base + combining mark, and the marks
    # (category Mn) are dropped.
    plain = "".join(
        ch
        for ch in unicodedata.normalize("NFD", word)
        if unicodedata.category(ch) != "Mn"
    )
    cut = len(word)
    # 1. Walk backwards over the trailing consonants.
    while cut > 0 and plain[cut - 1] not in vowels:
        cut -= 1
    # 2. Drop the vowel digraph or single vowel that precedes them.
    if cut > 2 and plain[cut - 2 : cut] in digraphs:
        cut -= 2
    elif cut > 1 and plain[cut - 1] in vowels:
        cut -= 1
    return word[:cut]
def expand_suffix_forms(forms: list[Form]) -> list[Form]:
    """Expand headword suffix endings for Greek adjectives or participles.

    Assume that forms are given in parsed order. That is: masc/fem/neut

    Reference:
    * https://el.wiktionary.org/wiki/Παράρτημα:Επίθετα_και_μετοχές_(νέα_ελληνικά)
    * https://el.wiktionary.org/wiki/άμεσος (adjective)
    * https://el.wiktionary.org/wiki/αρσενικός (adjective)
    * https://el.wiktionary.org/wiki/αναμμένος (participle)
    """
    assert len(forms) == 3
    masc = forms[0]
    stem = get_stem(masc.form)

    gender_tags = (
        ["masculine", "singular", "nominative"],
        ["feminine", "singular", "nominative"],
        ["neuter", "singular", "nominative"],
    )

    expanded: list[Form] = []

    # The masculine form is the full headword: keep it as-is, just tagged.
    masc_copy = masc.model_copy(deep=True)
    masc_copy.tags.extend(gender_tags[0])
    expanded.append(masc_copy)

    # Feminine/neuter entries are endings like "-η/-ια": strip the dash,
    # split the alternatives, and glue each onto the stem.
    for tag_idx, ending_form in enumerate(forms[1:], start=1):
        for ending in ending_form.form.replace("-", "").split("/"):
            new_form = ending_form.model_copy(deep=True)
            new_form.form = f"{stem}{ending}"
            new_form.tags.extend(gender_tags[tag_idx])
            expanded.append(new_form)

    return expanded