Coverage for src/wiktextract/extractor/el/pronunciation.py: 14%
78 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
3from wikitextprocessor import NodeKind, TemplateNode, WikiNode
4from wikitextprocessor.parser import LEVEL_KIND_FLAGS # , print_tree
6from wiktextract import WiktextractContext
7from wiktextract.page import clean_node, clean_value
9# from wiktextract.wxr_logging import logger
10from .models import Sound, WordEntry
11from .parse_utils import Heading, POSReturns, find_sections
12from .tags_utils import convert_tags
# Pronunciation-section templates whose output is deliberately dropped.
# Honestly, just ignore everything...
TEMPLATES_TO_IGNORE: set[str] = {
    "ήχος",  # audio files, -> <phonos>
    "ομόηχ",  # original note said "consonant??" — possibly homophones; verify
}
# Templates that expand to an IPA transcription, e.g. "ΔΦΑ : /ˈci.klos/".
IPA_TEMPLATES: set[str] = {
    "δφα",  # -> ΔΦΑ : /ˈci.klos/
}
# Templates that emit hyphenation text.
HYPHEN_TEMPLATES = {
    "συλλ",  # seems to be hyphenation XXX use hyphenation data
}
# "Tonal paronym" (near-homophone) templates, accented and unaccented forms.
HOMOPHONES_TEMPLATES = {
    "παρών",  # tonal paronym, near-synonym, cognate
    "παρων",
}
# Extracts the IPA transcription from expanded {{δφα}} output, which looks
# like "ΔΦΑ : /ˈci.klos/".
IPA_RE = re.compile(r"ΔΦΑ : /([^/]+)/")

# Extracts the hyphenation text that follows the expanded {{συλλ}} label.
HYPHEN_RE = re.compile(r"τυπογραφικός συλλαβισμός : ([^\n]+)(\n|$)")

# Marker emitted by the node handler in place of homophone templates; the
# captured remainder of the line holds the comma-separated homophones.
HOMOPHONES_RE = re.compile(r"__HOMOPHONES__(.+)")
50# Greek Wiktionary Pronunciation Sections #
51# These tend to be super-simple and we might get away with using a
52# template handling function that just extracts IPA templates (and others)
53# from the content.
def process_pron(
    wxr: WiktextractContext,
    node: WikiNode,
    target_data: WordEntry,
    title: str,
    num: int,  # Running section number supplied by the caller.
) -> tuple[int, POSReturns]:
    """Process a Pronunciation section WikiNode, extracting Sound data entries
    which are inserted into target_data.sounds. target_data is a WordEntry, so
    can be base_data (used to complete other entries) or an individual POS
    entry.

    Returns the highest section number encountered and a list of POS
    subsection tuples (each carrying a deep copy of target_data) for the
    caller to process further.
    """

    # We save data in the node handler below into these local lists, so the
    # handler has to be defined inside this larger function for easy access
    # to them. It can't be defined outside this function either, because we
    # need access to `wxr` from here, and the node_handler_fn signature is
    # already set in wikitextprocessor.
    sounds: list[Sound] = []
    hyphenations: list[str] = []

    content: list[WikiNode] = []
    sublevels: list[WikiNode] = []

    pos_returns: POSReturns = []

    wxr.wtp.start_subsection(title)

    section_num = num

    for child in node.children:
        if isinstance(child, str):
            # Ignore bare strings between nodes.
            continue
        if child.kind in LEVEL_KIND_FLAGS:
            # Stop at first Level; everything before this is 'content',
            # direct children of the parent node, everything after levels
            # start are sublevels.
            sublevels.append(child)
            continue
        content.append(child)

    def pronunciation_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | None:
        assert isinstance(node, WikiNode)
        kind = node.kind
        if isinstance(node, TemplateNode):
            tname = node.template_name.lower()
            if tname in IPA_TEMPLATES:
                # Expanded output looks like "ΔΦΑ : /ˈci.klos/".
                if m := IPA_RE.search(clean_node(wxr, None, node)):
                    sounds.append(Sound(ipa=m.group(1)))
                return []
            elif tname in HYPHEN_TEMPLATES:
                if m := HYPHEN_RE.search(clean_node(wxr, None, node)):
                    hyphenations.append(m.group(1))
                return []
            # Ugh, XXX, homophone templates are just a placeholder for the
            # text "homophones", and the actual data is in the text; leave
            # a marker for the line loop below to pick up.
            elif tname in HOMOPHONES_TEMPLATES:
                return ["__HOMOPHONES__"]
            # Only unhandled templates need expanding (expanding before the
            # branches above wasted work on templates we consume directly).
            # Recursively expand templates so that even nodes inside the
            # templates are handled with this handler. Argh. Don't use
            # "node_to_text" on the raw node, that causes bad output...
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            new_node = wxr.wtp.parse(expanded)
            return wxr.wtp.node_to_text(new_node)
        elif kind in {
            NodeKind.TABLE,
        }:
            return [*node.children]
        return None

    for line in wxr.wtp.node_to_text(
        content, node_handler_fn=pronunciation_node_handler_fn
    ).splitlines():
        if line.strip() == "":
            continue
        # Have to handle Homophones here because the homophone template
        # only generates a "homophones follow" message (replaced by the
        # __HOMOPHONES__ marker above)...
        if m := HOMOPHONES_RE.search(line):
            homophones = [
                clean_value(wxr, s).strip() for s in m.group(1).split(",")
            ]
            sounds.append(Sound(homophones=homophones))

    # `sub_num` rather than `num` so the parameter is not shadowed.
    for heading_type, pos, heading_name, tags, sub_num, subnode in find_sections(
        wxr, sublevels
    ):
        section_num = max(section_num, sub_num)

        if heading_type == Heading.POS:
            pos_returns.append(
                (
                    pos,
                    heading_name,
                    tags,
                    sub_num,
                    subnode,
                    target_data.copy(deep=True),
                )
            )

    # remove duplicate tags
    for st in sounds:
        legit_tags, raw_tags, poses = convert_tags(st.raw_tags)
        if len(legit_tags) > 0:
            st.tags = list(set(legit_tags))
            st.raw_tags = list(set(raw_tags))
        if len(poses) > 0:
            st.poses.extend(poses)
            st.poses = list(set(st.poses))

    # Completely replace sound data with the new list (empty when nothing
    # was found, matching the previous explicit else-branch).
    target_data.sounds = sounds

    if len(hyphenations) > 0:
        # NOTE(review): `+=` appends to any pre-existing hyphenation text
        # with no separator, while the else-branch clears it — confirm the
        # append (vs plain assignment) is intended.
        target_data.hyphenation += ", ".join(hyphenations)
    else:
        target_data.hyphenation = ""

    return section_num, pos_returns