Coverage for src/wiktextract/extractor/el/pronunciation.py: 16%

81 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1import re 

2from typing import cast 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS # , print_tree 

6 

7from wiktextract import WiktextractContext 

8from wiktextract.clean import clean_value 

9from wiktextract.page import clean_node 

10 

11# from wiktextract.wxr_logging import logger 

12from .models import Sound, WordEntry 

13from .parse_utils import POSReturns, find_sections 

14from .section_titles import Heading, POSName 

15from .tags_utils import convert_tags 

16 

# Template names (lowercased) seen in Greek Wiktionary pronunciation
# sections. Set literals instead of set((...)) — same contents, idiomatic.
TEMPLATES_TO_IGNORE: set[str] = {
    # Honestly, just ignore everything...
    "ήχος",  # audio files, -> <phonos>
    "ομόηχ",  # consonant??
}

IPA_TEMPLATES: set[str] = {
    "δφα",  # -> ΔΦΑ : /ˈci.klos/
}

HYPHEN_TEMPLATES: set[str] = {
    "συλλ",  # seems to be hyphenation XXX use hyphenation data
}

HOMOPHONES_TEMPLATES: set[str] = {
    "παρών",  # tonal paronym, near-synonym, cognate
    "παρων",
}

# The expanded IPA template renders as "ΔΦΑ : /<ipa>/"; capture the part
# between the slashes.
IPA_RE = re.compile(r"ΔΦΑ : /([^/]+)/")

# "typographic syllabification : <hyphenated form>" up to end of line.
HYPHEN_RE = re.compile(r"τυπογραφικός συλλαβισμός : ([^\n]+)(\n|$)")

# Marker emitted by the node handler in place of a homophones template;
# the actual homophone list follows it on the same line.
HOMOPHONES_RE = re.compile(r"__HOMOPHONES__(.+)")

51 

52 

53# Greek Wiktionary Pronunciation Sections # 

54# These tend to be super-simple and we might get away with using a 

55# template handling function that just extracts IPA templates (and others) 

56# from the content. 

57 

58 

def process_pron(
    wxr: WiktextractContext,
    node: WikiNode,
    target_data: WordEntry,
    title: str,
    num: int,  # Section number
) -> tuple[int, POSReturns]:
    """Process a Pronunciation section WikiNode, extracting Sound data entries
    which are inserted into target_data.sounds. target_data is a WordEntry, so
    can be base_data (used to complete other entries) or an individual POS
    entry.

    Returns a tuple of (highest section number seen, which is at least
    `num`) and a list of POS sections found nested under this pronunciation
    section, each paired with a deep copy of target_data so the sound data
    collected here propagates to each POS entry.
    """

    # We save data in parse_pronunciation_template_fn into this local list,
    # so the template_fn has to be defined inside this larger function so
    # that it has easy access to sound_templates. Can't be defined outside
    # this function either, because we need access to `wxr` from here, and
    # the template_fn signature is already set in wikitextprocessor.
    sounds: list[Sound] = []
    hyphenations: list[str] = []

    # Direct (non-level) children of this section vs. nested subsections.
    content: list[WikiNode] = []
    sublevels: list[WikiNode] = []

    pos_returns: POSReturns = []

    wxr.wtp.start_subsection(title)

    section_num = num

    for child in node.children:
        if isinstance(child, str):
            # Ignore strings
            continue
        if child.kind in LEVEL_KIND_FLAGS:
            # Stop at first Level; everything before this is 'content',
            # direct children of the parent node, everything after levels
            # start are sublevels.
            sublevels.append(child)
            continue
        content.append(child)

    def pronunciation_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | str | None:
        # node_to_text handler: intercepts known pronunciation templates.
        # Side effects: appends to the enclosing `sounds` and
        # `hyphenations` lists. Returns [] to suppress a handled template
        # from the text output, a marker string for homophone templates,
        # expanded text for other templates, or None for default handling.
        assert isinstance(node, WikiNode)
        kind = node.kind
        if isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # the templates are handled with bold_node_handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            tname = node.template_name.lower()
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            new_node = wxr.wtp.parse(expanded)
            if tname in IPA_TEMPLATES:
                # print(f"{tname=}")
                # Expanded template renders "ΔΦΑ : /<ipa>/"; pull out the
                # transcription between the slashes.
                if m := IPA_RE.search(clean_node(wxr, None, node)):
                    # print(f"{m=}")
                    sounds.append(Sound(ipa=m.group(1)))
                return []
            elif tname in HYPHEN_TEMPLATES:
                # print(f"{tname=}")
                if m := HYPHEN_RE.search(clean_node(wxr, None, node)):
                    # print(f"{m=}")
                    hyphenations.append(m.group(1))
                return []
            # Ugh, XXX, homophone templates are just a placeholder for the
            # text "homophones", and the actual data is in the text
            elif tname in HOMOPHONES_TEMPLATES:
                return ["__HOMOPHONES__"]
                # if m := HOMOPHONES_RE.search(clean_node(wxr, None, node)):
                #     sounds.append(Sound(homophones=[m.group(1)]))
            ret = wxr.wtp.node_to_text(new_node)
            return ret
        elif kind in {
            NodeKind.TABLE,
        }:
            # Flatten tables into their children so table contents are
            # still rendered as plain text lines.
            return [*node.children]
        return None

    for line in wxr.wtp.node_to_text(
        content, node_handler_fn=pronunciation_node_handler_fn
    ).splitlines():
        if line.strip() == "":
            continue
        # Have to handle Homophones here because the homophone template
        # only generates a "homophones follow" message...
        if m := HOMOPHONES_RE.search(line):
            homophones = list(
                clean_value(wxr, s).strip() for s in m.group(1).split(",")
            )
            sounds.append(Sound(homophones=homophones))

    # Recurse into nested subsections; POS sections are returned to the
    # caller together with a deep copy of target_data.
    # NOTE: the loop variable `num` rebinds the parameter of the same name;
    # section_num tracks the maximum section number encountered.
    for heading_type, pos, heading_title, tags, num, subnode in find_sections(
        wxr, sublevels
    ):
        section_num = num if num > section_num else section_num

        if heading_type == Heading.POS:
            # SAFETY: Since the heading_type is POS, find_sections
            # "pos_or_section" is guaranteed to be a pos: POSName
            pos = cast(POSName, pos)
            pos_returns.append(
                (
                    pos,
                    heading_title,
                    tags,
                    num,
                    subnode,
                    target_data.model_copy(deep=True),
                )
            )

    # remove duplicate tags
    for st in sounds:
        legit_tags, raw_tags, poses = convert_tags(st.raw_tags)
        if len(legit_tags) > 0:
            st.tags = sorted(set(legit_tags))
            st.raw_tags = sorted(set(raw_tags))
        if len(poses) > 0:
            st.poses.extend(poses)
            st.poses = sorted(set(st.poses))

    if len(sounds) > 0:
        # completely replace sound data with new
        target_data.sounds = sounds
    else:
        target_data.sounds = []
    if len(hyphenations) > 0:
        # NOTE(review): appends to any pre-existing hyphenation string
        # (unlike sounds above, which are replaced outright) — confirm
        # this asymmetry is intended.
        target_data.hyphenation += ", ".join(hyphenations)
    else:
        target_data.hyphenation = ""

    # print(f"{sounds=}, {hyphenations=}, {target_data=}")
    return section_num, pos_returns