Coverage for src/wiktextract/extractor/el/pronunciation.py: 14%

78 statements  


import re

from wikitextprocessor import NodeKind, TemplateNode, WikiNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS  # , print_tree

from wiktextract import WiktextractContext
from wiktextract.page import clean_node, clean_value

# from wiktextract.wxr_logging import logger
from .models import Sound, WordEntry
from .parse_utils import Heading, POSReturns, find_sections
from .tags_utils import convert_tags

TEMPLATES_TO_IGNORE: set[str] = set(
    # Honestly, just ignore everything...
    (
        "ήχος",  # audio files, -> <phonos>
        "ομόηχ",  # homophones
    )
)

IPA_TEMPLATES: set[str] = set(
    (
        "δφα",  # -> ΔΦΑ : /ˈci.klos/
    )
)

HYPHEN_TEMPLATES = set(
    (
        "συλλ",  # seems to be hyphenation XXX use hyphenation data
    )
)

HOMOPHONES_TEMPLATES = set(
    (
        "παρών",  # tonal paronym: words differing only in accent
        "παρων",
    )
)

IPA_RE = re.compile(r"ΔΦΑ : /([^/]+)/")
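# For example, an expansion like "ΔΦΑ : /ˈci.klos/" (see IPA_TEMPLATES above)
# matches IPA_RE with group(1) == "ˈci.klos".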

HYPHEN_RE = re.compile(r"τυπογραφικός συλλαβισμός : ([^\n]+)(\n|$)")
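# "τυπογραφικός συλλαβισμός" is "typographic syllabification"; a hypothetical
# expansion like "τυπογραφικός συλλαβισμός : κύ-κλος" would match HYPHEN_RE
# with group(1) == "κύ-κλος".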

# HOMOPHONES_RE = re.compile(r"τονικό παρώνυμο[^:]+: ([^\n]+)(\n|$)")

HOMOPHONES_RE = re.compile(r"__HOMOPHONES__(.+)")
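# The marker round trip: pronunciation_node_handler_fn below replaces a
# homophones template with the literal "__HOMOPHONES__", so a rendered line
# like "__HOMOPHONES__βάζο, βάζω" (a made-up example) is caught by
# HOMOPHONES_RE in process_pron and becomes Sound(homophones=["βάζο", "βάζω"]).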


# Greek Wiktionary Pronunciation Sections #
# These tend to be super-simple and we might get away with using a
# template handling function that just extracts IPA templates (and others)
# from the content.


def process_pron(
    wxr: WiktextractContext,
    node: WikiNode,
    target_data: WordEntry,
    title: str,
    num: int,  # Section number
) -> tuple[int, POSReturns]:
    """Process a Pronunciation section WikiNode, extracting Sound data entries
    which are inserted into target_data.sounds. target_data is a WordEntry, so
    it can be base_data (used to complete other entries) or an individual POS
    entry."""

    # We save data in parse_pronunciation_template_fn into this local list,
    # so the template_fn has to be defined inside this larger function so
    # that it has easy access to sound_templates. It can't be defined outside
    # this function either, because we need access to `wxr` from here, and
    # the template_fn signature is already set in wikitextprocessor.
    sounds: list[Sound] = []
    hyphenations: list[str] = []

    content: list[WikiNode] = []
    sublevels: list[WikiNode] = []

    pos_returns: POSReturns = []

    wxr.wtp.start_subsection(title)

    section_num = num

    for child in node.children:
        if isinstance(child, str):
            # Ignore strings
            continue
        if child.kind in LEVEL_KIND_FLAGS:
            # Stop at the first Level node: everything before it is 'content'
            # (direct children of the parent node), everything from the first
            # Level onwards is a sublevel.
            sublevels.append(child)
            continue
        content.append(child)

    def pronunciation_node_handler_fn(
        node: WikiNode,
    ) -> list[str | WikiNode] | None:
        assert isinstance(node, WikiNode)
        kind = node.kind
        if isinstance(node, TemplateNode):
            # Recursively expand templates so that even nodes inside the
            # templates are handled with bold_node_handler.
            # Argh. Don't use "node_to_text", that causes bad output...
            tname = node.template_name.lower()
            expanded = wxr.wtp.expand(wxr.wtp.node_to_wikitext(node))
            new_node = wxr.wtp.parse(expanded)
            if tname in IPA_TEMPLATES:
                # print(f"{tname=}")
                if m := IPA_RE.search(clean_node(wxr, None, node)):
                    # print(f"{m=}")
                    sounds.append(Sound(ipa=m.group(1)))
                return []
            elif tname in HYPHEN_TEMPLATES:
                # print(f"{tname=}")
                if m := HYPHEN_RE.search(clean_node(wxr, None, node)):
                    # print(f"{m=}")
                    hyphenations.append(m.group(1))
                return []
            # Ugh, XXX, homophone templates are just a placeholder for the
            # text "homophones", and the actual data is in the text
            elif tname in HOMOPHONES_TEMPLATES:
                return ["__HOMOPHONES__"]
                # if m := HOMOPHONES_RE.search(clean_node(wxr, None, node)):
                #     sounds.append(Sound(homophones=[m.group(1)]))
            ret = wxr.wtp.node_to_text(new_node)
            return ret
        elif kind in {
            NodeKind.TABLE,
        }:
            return [*node.children]
        return None
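
    # Handler contract, as used by node_to_text() below: returning a list of
    # strings/nodes substitutes it for the node's default rendering, returning
    # [] drops the node from the output, and returning None falls back to the
    # default conversion.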

    for line in wxr.wtp.node_to_text(
        content, node_handler_fn=pronunciation_node_handler_fn
    ).splitlines():
        if line.strip() == "":
            continue
        # Have to handle Homophones here because the homophone template
        # only generates a "homophones follow" message...
        if m := HOMOPHONES_RE.search(line):
            homophones = list(
                clean_value(wxr, s).strip() for s in m.group(1).split(",")
            )
            sounds.append(Sound(homophones=homophones))

    for heading_type, pos, heading_name, tags, num, subnode in find_sections(
        wxr, sublevels
    ):
        section_num = num if num > section_num else section_num

        if heading_type == Heading.POS:
            pos_returns.append(
                (
                    pos,
                    heading_name,
                    tags,
                    num,
                    subnode,
                    target_data.copy(deep=True),
                )
            )

    # remove duplicate tags
    for st in sounds:
        legit_tags, raw_tags, poses = convert_tags(st.raw_tags)
        if len(legit_tags) > 0:
            st.tags = list(set(legit_tags))
            st.raw_tags = list(set(raw_tags))
        if len(poses) > 0:
            st.poses.extend(poses)
            st.poses = list(set(st.poses))

    if len(sounds) > 0:
        # completely replace existing sound data with the new entries
        target_data.sounds = sounds
    else:
        target_data.sounds = []
    if len(hyphenations) > 0:
        target_data.hyphenation += ", ".join(hyphenations)
    else:
        target_data.hyphenation = ""

    # print(f"{sounds=}, {hyphenations=}, {target_data=}")
    return section_num, pos_returns
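

# Rough usage sketch (illustrative; the real caller is the section-dispatch
# code elsewhere in this extractor, and "Προφορά" is just a stand-in section
# title):
#
#     section_num, pos_returns = process_pron(
#         wxr, pron_level_node, base_data, "Προφορά", section_num
#     )
#
# base_data.sounds and base_data.hyphenation are overwritten in place, and
# each entry in pos_returns is a (pos, heading_name, tags, num, subnode,
# copied WordEntry) tuple for a POS subsection found under this heading.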