Coverage for src/wiktextract/extractor/el/pronunciation.py: 16%
81 statements
coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
import re
from typing import cast

from wikitextprocessor import NodeKind, TemplateNode, WikiNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS  # , print_tree

from wiktextract import WiktextractContext
from wiktextract.clean import clean_value
from wiktextract.page import clean_node

# from wiktextract.wxr_logging import logger
from .models import Sound, WordEntry
from .parse_utils import POSReturns, find_sections
from .section_titles import Heading, POSName
from .tags_utils import convert_tags

TEMPLATES_TO_IGNORE: set[str] = {
    # Honestly, just ignore everything...
    "ήχος",  # audio files, -> <phonos>
    "ομόηχ",  # homophones?
}

IPA_TEMPLATES: set[str] = {
    "δφα",  # -> ΔΦΑ : /ˈci.klos/
}

HYPHEN_TEMPLATES: set[str] = {
    "συλλ",  # seems to be hyphenation XXX use hyphenation data
}

HOMOPHONES_TEMPLATES: set[str] = {
    "παρών",  # tonal paronym: a near-homophone differing only in stress
    "παρων",
}

IPA_RE = re.compile(r"ΔΦΑ : /([^/]+)/")
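
# Sanity check, based on the sample {{δφα}} output noted above:
#     >>> IPA_RE.search("ΔΦΑ : /ˈci.klos/").group(1)
#     'ˈci.klos'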

HYPHEN_RE = re.compile(r"τυπογραφικός συλλαβισμός : ([^\n]+)(\n|$)")
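
# Assuming {{συλλ}} expands to text like "τυπογραφικός συλλαβισμός : κύ-κλος"
# (hypothetical example word), the regex captures the hyphenated form:
#     >>> HYPHEN_RE.search("τυπογραφικός συλλαβισμός : κύ-κλος").group(1)
#     'κύ-κλος'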

# HOMOPHONES_RE = re.compile(r"τονικό παρώνυμο[^:]+: ([^\n]+)(\n|$)")
HOMOPHONES_RE = re.compile(r"__HOMOPHONES__(.+)")
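
# The marker is injected by pronunciation_node_handler_fn below and matched
# again after the node tree has been flattened to text; e.g. for a
# hypothetical flattened line:
#     >>> HOMOPHONES_RE.search("__HOMOPHONES__νόμος, νομός").group(1)
#     'νόμος, νομός'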

# Greek Wiktionary Pronunciation Sections #
# These tend to be super-simple, and we might get away with using a
# template-handling function that just extracts IPA templates (and others)
# from the content.
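#
# A typical section body is assumed to be just a short run of template
# calls, roughly like this (argument order is illustrative, not verified):
#
#     {{δφα|el|ˈci.klos}}
#     {{συλλ|el|κύ-κλος}}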


def process_pron(
    wxr: WiktextractContext,
    node: WikiNode,
    target_data: WordEntry,
    title: str,
    num: int,  # Section number
) -> tuple[int, POSReturns]:
    """Process a Pronunciation section WikiNode, extracting Sound data
    entries which are inserted into target_data.sounds. target_data is a
    WordEntry, so it can be base_data (used to complete other entries) or
    an individual POS entry."""

    # We save data in pronunciation_node_handler_fn into the local lists
    # below, so the handler has to be defined inside this larger function,
    # where it has easy access to them. It can't be defined outside this
    # function anyway, because it needs access to `wxr` from here, and the
    # handler's signature is already fixed by wikitextprocessor.
    sounds: list[Sound] = []
    hyphenations: list[str] = []

    content: list[WikiNode] = []
    sublevels: list[WikiNode] = []

    pos_returns: POSReturns = []

    wxr.wtp.start_subsection(title)

    section_num = num

    for child in node.children:
        if isinstance(child, str):
            # Ignore strings
            continue
        if child.kind in LEVEL_KIND_FLAGS:
            # Split the children: level nodes (section headings and what
            # hangs under them) go into sublevels, everything else is
            # 'content' belonging directly to this section.
            sublevels.append(child)
            continue
        content.append(child)

    def pronunciation_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | str | None:
        assert isinstance(node, WikiNode)
        kind = node.kind
        if isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # templates are handled by this handler.
            # Argh. Don't use "node_to_text" directly on the unexpanded
            # template, that causes bad output...
            tname = node.template_name.lower()
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            new_node = wxr.wtp.parse(expanded)
            if tname in IPA_TEMPLATES:
                if m := IPA_RE.search(clean_node(wxr, None, node)):
                    sounds.append(Sound(ipa=m.group(1)))
                return []
            elif tname in HYPHEN_TEMPLATES:
                if m := HYPHEN_RE.search(clean_node(wxr, None, node)):
                    hyphenations.append(m.group(1))
                return []
            # Ugh, XXX, homophone templates are just a placeholder for the
            # text "homophones"; the actual data is in the surrounding text.
            elif tname in HOMOPHONES_TEMPLATES:
                return ["__HOMOPHONES__"]
                # if m := HOMOPHONES_RE.search(clean_node(wxr, None, node)):
                #     sounds.append(Sound(homophones=[m.group(1)]))
            ret = wxr.wtp.node_to_text(new_node)
            return ret
        elif kind in {
            NodeKind.TABLE,
        }:
            return [*node.children]
        return None
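
    # Handler contract, as used by wxr.wtp.node_to_text: returning None
    # falls back to the default rendering, while returning a str or list
    # replaces the node's output; returning [] therefore drops a node
    # entirely, as done above for the consumed IPA/hyphenation templates.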

    for line in wxr.wtp.node_to_text(
        content, node_handler_fn=pronunciation_node_handler_fn
    ).splitlines():
        if line.strip() == "":
            continue
        # Homophones have to be handled here, because the homophone
        # template itself only generates a "homophones follow" message...
        if m := HOMOPHONES_RE.search(line):
            homophones = [
                clean_value(wxr, s).strip() for s in m.group(1).split(",")
            ]
            sounds.append(Sound(homophones=homophones))
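
    # A hypothetical flattened line "__HOMOPHONES__νόμος, νομός" thus ends
    # up as Sound(homophones=["νόμος", "νομός"]).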

    for heading_type, pos, heading_title, tags, num, subnode in find_sections(
        wxr, sublevels
    ):
        section_num = max(section_num, num)

        if heading_type == Heading.POS:
            # SAFETY: since heading_type is POS, find_sections'
            # "pos_or_section" value is guaranteed to be a POSName
            pos = cast(POSName, pos)
            pos_returns.append(
                (
                    pos,
                    heading_title,
                    tags,
                    num,
                    subnode,
                    target_data.model_copy(deep=True),
                )
            )
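            # Deep copy, so that each POS subsection starts from its own
            # independent copy of the WordEntry.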

    # remove duplicate tags
    for st in sounds:
        legit_tags, raw_tags, poses = convert_tags(st.raw_tags)
        if len(legit_tags) > 0:
            st.tags = sorted(set(legit_tags))
            st.raw_tags = sorted(set(raw_tags))
        if len(poses) > 0:
            st.poses.extend(poses)
        st.poses = sorted(set(st.poses))
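
    # Note: convert_tags (behavior inferred from its use here) splits the
    # raw tags into recognized canonical tags, leftover raw tags, and POS
    # names.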

    if len(sounds) > 0:
        # completely replace the sound data with the new sounds
        target_data.sounds = sounds
    else:
        target_data.sounds = []
    if len(hyphenations) > 0:
        target_data.hyphenation += ", ".join(hyphenations)
    else:
        target_data.hyphenation = ""
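
    # Note the asymmetry as written: found hyphenations are appended (+=)
    # to any existing hyphenation string, while finding none clears it.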

    return section_num, pos_returns