Coverage for src/wiktextract/extractor/el/pronunciation.py: 16%

81 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1import re 

2from typing import cast 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS # , print_tree 

6 

7from wiktextract import WiktextractContext 

8from wiktextract.clean import clean_value 

9from wiktextract.page import clean_node 

10 

11# from wiktextract.wxr_logging import logger 

12from .models import Sound, WordEntry 

13from .parse_utils import POSReturns, find_sections 

14from .section_titles import Heading, POSName 

15from .tags_utils import convert_tags 

16 

# Template names (lowercased) seen in Greek Wiktionary pronunciation
# sections. Set literals instead of set((...)) — same contents, idiomatic.
TEMPLATES_TO_IGNORE: set[str] = {
    # Honestly, just ignore everything...
    "ήχος",  # audio files, -> <phonos>
    "ομόηχ",  # consonant??
}

IPA_TEMPLATES: set[str] = {
    "δφα",  # -> ΔΦΑ : /ˈci.klos/
}

HYPHEN_TEMPLATES: set[str] = {
    "συλλ",  # seems to be hyphenation XXX use hyphenation data
}

HOMOPHONES_TEMPLATES: set[str] = {
    "παρών",  # tonal paronym, near-synonym, cognate
    "παρων",
}

# The expanded IPA template renders as "ΔΦΑ : /<ipa>/"; capture the part
# between the slashes.
IPA_RE = re.compile(r"ΔΦΑ : /([^/]+)/")

# "typographic syllabification : <hyphenated form>" up to end of line.
HYPHEN_RE = re.compile(r"τυπογραφικός συλλαβισμός : ([^\n]+)(\n|$)")

# Marker emitted by the node handler in place of a homophones template;
# the actual homophone list follows it on the same line.
HOMOPHONES_RE = re.compile(r"__HOMOPHONES__(.+)")

51 

52 

53# Greek Wiktionary Pronunciation Sections # 

54# These tend to be super-simple and we might get away with using a 

55# template handling function that just extracts IPA templates (and others) 

56# from the content. 

57 

58 

def process_pron(
    wxr: WiktextractContext,
    node: WikiNode,
    target_data: WordEntry,
    title: str,
    num: int,  # Section number
) -> tuple[int, POSReturns]:
    """Process a Pronunciation section WikiNode, extracting Sound data entries
    which are inserted into target_data.sounds. target_data is a WordEntry, so
    can be base_data (used to complete other entries) or an individual POS
    entry.

    Returns a tuple of (highest section number seen, which is at least
    `num`) and a list of POS sections found nested under this pronunciation
    section, each paired with a deep copy of target_data so the sound data
    collected here propagates to each POS entry.
    """

    # We save data in parse_pronunciation_template_fn into this local list,
    # so the template_fn has to be defined inside this larger function so
    # that it has easy access to sound_templates. Can't be defined outside
    # this function either, because we need access to `wxr` from here, and
    # the template_fn signature is already set in wikitextprocessor.
    sounds: list[Sound] = []
    hyphenations: list[str] = []

    # Direct (non-level) children of this section vs. nested subsections.
    content: list[WikiNode] = []
    sublevels: list[WikiNode] = []

    pos_returns: POSReturns = []

    wxr.wtp.start_subsection(title)

    section_num = num

    for child in node.children:
        if isinstance(child, str):
            # Ignore strings
            continue
        if child.kind in LEVEL_KIND_FLAGS:
            # Stop at first Level; everything before this is 'content',
            # direct children of the parent node, everything after levels
            # start are sublevels.
            sublevels.append(child)
            continue
        content.append(child)

    def pronunciation_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | str | None:
        # node_to_text handler: intercepts known pronunciation templates.
        # Side effects: appends to the enclosing `sounds` and
        # `hyphenations` lists. Returns [] to suppress a handled template
        # from the text output, a marker string for homophone templates,
        # expanded text for other templates, or None for default handling.
        assert isinstance(node, WikiNode)
        kind = node.kind
        if isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # the templates are handled with bold_node_handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            tname = node.template_name.lower()
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            new_node = wxr.wtp.parse(expanded)
            if tname in IPA_TEMPLATES:
                # print(f"{tname=}")
                # Expanded template renders "ΔΦΑ : /<ipa>/"; pull out the
                # transcription between the slashes.
                if m := IPA_RE.search(clean_node(wxr, None, node)):
                    # print(f"{m=}")
                    sounds.append(Sound(ipa=m.group(1)))
                return []
            elif tname in HYPHEN_TEMPLATES:
                # print(f"{tname=}")
                if m := HYPHEN_RE.search(clean_node(wxr, None, node)):
                    # print(f"{m=}")
                    hyphenations.append(m.group(1))
                return []
            # Ugh, XXX, homophone templates are just a placeholder for the
            # text "homophones", and the actual data is in the text
            elif tname in HOMOPHONES_TEMPLATES:
                return ["__HOMOPHONES__"]
                # if m := HOMOPHONES_RE.search(clean_node(wxr, None, node)):
                #     sounds.append(Sound(homophones=[m.group(1)]))
            ret = wxr.wtp.node_to_text(new_node)
            return ret
        elif kind in {
            NodeKind.TABLE,
        }:
            # Flatten tables into their children so table contents are
            # still rendered as plain text lines.
            return [*node.children]
        return None

    for line in wxr.wtp.node_to_text(
        content, node_handler_fn=pronunciation_node_handler_fn
    ).splitlines():
        if line.strip() == "":
            continue
        # Have to handle Homophones here because the homophone template
        # only generates a "homophones follow" message...
        if m := HOMOPHONES_RE.search(line):
            homophones = list(
                clean_value(wxr, s).strip() for s in m.group(1).split(",")
            )
            sounds.append(Sound(homophones=homophones))

    # Recurse into nested subsections; POS sections are returned to the
    # caller together with a deep copy of target_data.
    # NOTE: the loop variable `num` rebinds the parameter of the same name;
    # section_num tracks the maximum section number encountered.
    for heading_type, pos, heading_title, tags, num, subnode in find_sections(
        wxr, sublevels
    ):
        section_num = num if num > section_num else section_num

        if heading_type == Heading.POS:
            # SAFETY: Since the heading_type is POS, find_sections
            # "pos_or_section" is guaranteed to be a pos: POSName
            pos = cast(POSName, pos)
            pos_returns.append(
                (
                    pos,
                    heading_title,
                    tags,
                    num,
                    subnode,
                    target_data.model_copy(deep=True),
                )
            )

    # remove duplicate tags
    for st in sounds:
        legit_tags, raw_tags, poses = convert_tags(st.raw_tags)
        if len(legit_tags) > 0:
            st.tags = sorted(set(legit_tags))
            st.raw_tags = sorted(set(raw_tags))
        if len(poses) > 0:
            st.poses.extend(poses)
            st.poses = sorted(set(st.poses))

    if len(sounds) > 0:
        # completely replace sound data with new
        target_data.sounds = sounds
    else:
        target_data.sounds = []
    if len(hyphenations) > 0:
        # NOTE(review): appends to any pre-existing hyphenation string
        # (unlike sounds above, which are replaced outright) — confirm
        # this asymmetry is intended.
        target_data.hyphenation += ", ".join(hyphenations)
    else:
        target_data.hyphenation = ""

    # print(f"{sounds=}, {hyphenations=}, {target_data=}")
    return section_num, pos_returns