# src/wiktextract/extractor/el/parse_utils.py

import re
import unicodedata
from typing import Generator, TypeAlias

from wikitextprocessor import WikiNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Form, WordEntry
from .section_titles import (
    POS_HEADINGS,
    POS_HEADINGS_RE,
    SUBSECTION_HEADINGS,
    SUBSECTIONS_RE,
    Heading,
    POSName,
    Tags,
)
from .text_utils import normalized_int

# Ignorable templates that generate panels to the side, like
# Template:Wikipedia, or other meta-info like Template:see.
# Called 'panel templates' because they often generate panels.
PANEL_TEMPLATES: set[str] = set(
    [
        "interwiktionary",
        "stub",
        "wik",
        "wikipedia",
        "Wikipedia",
        "wikispecies",
        "wikiquote",
        "Wikiquote",
        "improve",
    ]
)

# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes, or that should generally
# be ignored).
# PANEL_PREFIXES: set[str] = set()

# Additional templates to be expanded in the pre-expand phase.
# XXX nothing here yet; add as needed if some template turns out to be
# problematic when unexpanded.
ADDITIONAL_EXPAND_TEMPLATES: set[str] = set()

# Names of templates used in etymology sections whose parameters we want
# to store in `etymology_templates`.
ETYMOLOGY_TEMPLATES: set[str] = set()

GREEK_LANGCODES = set(
    (
        "el",
        "grc",
        "el2",
        "el-crt",
        "el-cyp",
        "gkm",
        "gkm-cyp",
        "gkm-crt",
        "gmy",
        "gmy2",
        "grc-dor",
        "grc-ion",
        "grc-koi",
        "grk",
        "grk-ita",
        "grk-pro",
        "kath",
        "pnt",
        "pregrc",
        "tsd",
        "xme-old",
        "xmk",
    )
)


Title: TypeAlias = str

POSReturns: TypeAlias = list[
    tuple[POSName, Title, Tags, int, WikiNode, WordEntry]
]


def find_sections(
    wxr: WiktextractContext,
    nodes: list[WikiNode],
) -> Generator[tuple[Heading, POSName, Title, Tags, int, WikiNode], None, None]:
    for node in nodes:
        heading_title = clean_node(wxr, None, node.largs[0]).lower().strip()

        type, pos, heading_name, tags, num, ok = parse_lower_heading(
            wxr, heading_title
        )

        if num > 0:
            wxr.wtp.warning(
                f"Sub-sub-section is numbered: {heading_name}, {num=}",
                sortid="page/find_pos_sections_1",
            )
        yield type, pos, heading_name, tags, num, node
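
# A hedged usage sketch (hypothetical call site, not from this module):
# callers would typically pass the level-2 heading nodes of a language
# section and unpack the yielded tuples, e.g.
#
#     for htype, pos, title, tags, num, snode in find_sections(wxr, nodes):
#         if htype is Heading.POS:
#             ...  # handle a part-of-speech section
#         elif htype is Heading.Err:
#             ...  # log and skip an unrecognized heading
#
# where `nodes` is assumed to be a list of heading WikiNodes.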


def parse_lower_heading(
    wxr: WiktextractContext, heading: str
) -> tuple[Heading, str, str, Tags, int, bool]:
    """Determine if a heading is for a part of speech or other subsection.
    Returns a tuple of the heading type enum, the POS name or other string
    data, the original heading text, a list of tags, a trailing section
    number (-1 if absent) and a success bool."""
    if m := POS_HEADINGS_RE.match(heading):
        pos, tags, num, ok = parse_pos_heading(wxr, heading, m)
        if ok:
            return Heading.POS, pos, heading, tags, num, True

    if m := SUBSECTIONS_RE.match(heading):
        section, section_name, tags, num, ok = parse_section_heading(
            wxr, heading, m
        )
        if ok:
            return section, section_name, heading, tags, num, True

    return Heading.Err, "", heading, [], -1, False
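
# Illustrative doctest-style examples (the heading strings and the resulting
# POS names are assumptions; the real mappings live in POS_HEADINGS and
# SUBSECTION_HEADINGS):
#
#     >>> parse_lower_heading(wxr, "ουσιαστικό")  # "noun", if present in
#     ...                                         # POS_HEADINGS
#     (Heading.POS, 'noun', 'ουσιαστικό', [], -1, True)
#     >>> parse_lower_heading(wxr, "no such heading")
#     (Heading.Err, '', 'no such heading', [], -1, False)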


def parse_pos_heading(
    wxr: WiktextractContext, heading: str, m: re.Match
) -> tuple[POSName, Tags, int, bool]:
    pos_str = m.group(1)
    rest = m.group(2)
    post_number = -1
    if rest:
        # logger.info(f"POS REST: '{rest}'")
        if rest.strip().isdigit():
            post_number = normalized_int(rest.strip())
            # logger.info(f"POST_NUMBER {post_number}")
    pos_data = POS_HEADINGS[pos_str]
    return pos_data["pos"], pos_data.get("tags", []), post_number, True
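
# Numbered POS headings (used when a page has several sections for the same
# part of speech) carry the number in the second regex group: assuming
# "ρήμα" ("verb") is a POS_HEADINGS key, a heading like "ρήμα 2" would
# yield post_number == 2, while a plain "ρήμα" yields the -1 sentinel.
# parse_section_heading() below applies the same convention to subsections.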


def parse_section_heading(
    wxr: WiktextractContext, heading: str, m: re.Match
) -> tuple[Heading, str, Tags, int, bool]:
    subsection_str = m.group(1)
    rest = m.group(2)
    post_number = -1
    if rest:
        # logger.info(f"SUBSECTION REST: '{rest}'")
        if rest.strip().isdigit():
            post_number = normalized_int(rest.strip())
            # logger.info(f"POST_NUMBER {post_number}")
    section_data = SUBSECTION_HEADINGS[subsection_str]
    return (
        section_data["type"],
        subsection_str,
        section_data.get("tags", []),
        post_number,
        True,
    )


# https://stackoverflow.com/a/518232
def strip_accents(accented: str) -> str:
    return "".join(
        c
        for c in unicodedata.normalize("NFD", accented)
        if unicodedata.category(c) != "Mn"
    )
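
# Example: NFD normalization decomposes each precomposed accented letter
# into its base letter plus combining marks (Unicode category "Mn"), and
# the marks are then dropped:
#
#     >>> strip_accents("καφές")
#     'καφες'
#     >>> strip_accents("προϊόν")
#     'προιον'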


def remove_duplicate_forms(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Check for identical `forms` and remove duplicates."""
    if not forms:
        return []
    new_forms = []
    for i, form in enumerate(forms):
        for comp in forms[i + 1 :]:
            if (
                form.form == comp.form
                and form.tags == comp.tags
                and form.raw_tags == comp.raw_tags
            ):
                break
                # Effectively a "continue" for the outer loop: a later
                # duplicate exists, so skip this form without triggering
                # the else-block below.
        else:
            # No duplicates found in the inner loop (exited without breaking)
            new_forms.append(form)
    if len(forms) > len(new_forms):
        # wxr.wtp.debug("Found duplicate forms", sortid="simple/pos/32")
        return new_forms
    return forms
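
# A minimal sketch of the behavior (Form values are hypothetical; tags and
# raw_tags are assumed to default to empty lists in the model):
#
#     >>> forms = [Form(form="γάτα"), Form(form="γάτες"), Form(form="γάτα")]
#     >>> [f.form for f in remove_duplicate_forms(wxr, forms)]
#     ['γάτες', 'γάτα']
#
# Note that the last occurrence of each duplicate is the one kept: the
# inner loop only scans forward, so earlier copies hit the `break`.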