Coverage for src/wiktextract/extractor/el/parse_utils.py: 81%

94 statements  

coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

import re
import unicodedata
from typing import Generator, TypeAlias

from wikitextprocessor import LevelNode, WikiNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Form, WordEntry
from .section_titles import (
    POS_HEADINGS,
    POS_HEADINGS_RE,
    SUBSECTION_HEADINGS,
    SUBSECTIONS_RE,
    Heading,
    POSName,
    Tags,
)
from .text_utils import normalized_int


# Ignorable templates that generate panels to the side, like
# Template:Wikipedia, or other meta-info like Template:see.
# Called 'panel templates' because they often generate panels.
PANEL_TEMPLATES: set[str] = {
    "interwiktionary",
    "stub",
    "wik",
    "wikipedia",
    "Wikipedia",
    "wikispecies",
    "wikiquote",
    "Wikiquote",
    "improve",
}

# Template name prefixes used for language-specific panel templates, i.e.
# templates that create side boxes or notice boxes, or that should
# generally be ignored.
# PANEL_PREFIXES: set[str] = set()

# Additional templates to be expanded in the pre-expand phase.
# XXX nothing here yet; add entries as needed if some template turns out
# to be problematic when left unexpanded.
ADDITIONAL_EXPAND_TEMPLATES: set[str] = set()

# Names of templates used in etymology sections whose parameters we want
# to store in `etymology_templates`.
ETYMOLOGY_TEMPLATES: set[str] = set()


GREEK_LANGCODES = {
    "el",
    "grc",
    "el2",
    "el-crt",
    "el-cyp",
    "gkm",
    "gkm-cyp",
    "gkm-crt",
    "gmy",
    "gmy2",
    "grc-dor",
    "grc-ion",
    "grc-koi",
    "grk",
    "grk-ita",
    "grk-pro",
    "kath",
    "pnt",
    "pregrc",
    "tsd",
    "xme-old",
    "xmk",
}



Title: TypeAlias = str
SectionName: TypeAlias = str  # The keys of SUBSECTION_HEADINGS
POSReturns: TypeAlias = list[
    tuple[POSName, Title, Tags, int, WikiNode, WordEntry]
]



def find_sections(
    wxr: WiktextractContext,
    nodes: list[WikiNode] | list[LevelNode],
) -> Generator[
    tuple[Heading, POSName | SectionName, Title, Tags, int, WikiNode],
    None,
    None,
]:
    """In practice, only called when we expect heading_type to be either
    Heading.POS or Heading.Pron.

    Heading.POS guarantees that pos_or_section is a POSName.
    Heading.Pron guarantees that pos_or_section is a SectionName and,
    looking at SUBSECTION_HEADINGS, either "pronunciation" or "προφορά".
    """
    for node in nodes:
        heading_title = clean_node(wxr, None, node.largs[0]).lower().strip()

        heading_type, pos_or_section, tags, num, ok = parse_lower_heading(
            wxr, heading_title
        )

        if num > 0:
            wxr.wtp.wiki_notice(
                f"Sub-sub-section is numbered: {heading_title}, {num=}",
                sortid="page/find_pos_sections_1",
            )
        yield heading_type, pos_or_section, heading_title, tags, num, node
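
# Typical call-site sketch (hedged; `level_nodes` is a hypothetical name for
# a list of heading LevelNodes, not something defined in this module):
#
#   for htype, name, title, tags, num, node in find_sections(wxr, level_nodes):
#       if htype == Heading.POS:
#           ...  # name is a POSName
#       elif htype == Heading.Pron:
#           ...  # name is a SectionName ("pronunciation" or "προφορά")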



def parse_lower_heading(
    wxr: WiktextractContext, heading: str
) -> tuple[Heading, POSName | SectionName, Tags, int, bool]:
    """Determine whether a heading is for a part of speech or for another
    kind of subsection. Returns the heading type enum, the POS name or
    section name, a list of tags, a post-heading number (-1 if absent),
    and a success bool.
    """
    if m := POS_HEADINGS_RE.match(heading):
        heading_type, pos, tags, num, ok = parse_pos_heading(wxr, heading, m)
        if ok:
            return heading_type, pos, tags, num, True

    if m := SUBSECTIONS_RE.match(heading):
        heading_type, section, tags, num, ok = parse_section_heading(
            wxr, heading, m
        )
        if ok:
            return heading_type, section, tags, num, True

    return Heading.Err, "", [], -1, False
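
# Illustration (hedged; assumes SUBSECTION_HEADINGS recognizes "προφορά", as
# the find_sections docstring above indicates):
#   parse_lower_heading(wxr, "προφορά")   # (Heading.Pron, "προφορά", tags, -1, True)
#   parse_lower_heading(wxr, "gibberish")  # (Heading.Err, "", [], -1, False)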



def parse_pos_heading(
    wxr: WiktextractContext, heading: str, m: re.Match[str]
) -> tuple[Heading, POSName, Tags, int, bool]:
    pos_str = m.group(1)
    rest = m.group(2)
    post_number = -1
    if rest:
        # logger.info(f"POS REST: '{rest}'")
        if rest.strip().isdigit():
            post_number = normalized_int(rest.strip())
            # logger.info(f"POST_NUMBER {post_number}")
    pos_data = POS_HEADINGS[pos_str]
    return (
        Heading.POS,
        pos_data["pos"],
        pos_data.get("tags", []),
        post_number,
        True,
    )
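
# Shape of the expected match (an assumption about POS_HEADINGS_RE, whose
# definition lives in section_titles): group(1) is a POS_HEADINGS key and
# group(2) any trailing text, so a heading like "<pos> 2" would set
# post_number to 2 via normalized_int.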



def parse_section_heading(
    wxr: WiktextractContext, heading: str, m: re.Match[str]
) -> tuple[Heading, SectionName, Tags, int, bool]:
    subsection_str = m.group(1)
    rest = m.group(2)
    post_number = -1
    if rest:
        # logger.info(f"SUBSECTION REST: '{rest}'")
        if rest.strip().isdigit():
            post_number = normalized_int(rest.strip())
            # logger.info(f"POST_NUMBER {post_number}")
    section_data = SUBSECTION_HEADINGS[subsection_str]
    return (
        section_data["type"],
        subsection_str,
        section_data.get("tags", []),
        post_number,
        True,
    )



# https://stackoverflow.com/a/518232
def strip_accents(accented: str) -> str:
    return "".join(
        c
        for c in unicodedata.normalize("NFD", accented)
        if unicodedata.category(c) != "Mn"
    )
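
# Quick illustration (not part of the module): NFD decomposes "προφορά" into
# base letters plus a combining acute accent (category "Mn"), which is
# filtered out, so strip_accents("προφορά") == "προφορα".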



def remove_duplicate_forms(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Check `forms` for identical entries and remove duplicates."""
    if not forms:
        return []
    new_forms = []
    for i, form in enumerate(forms):
        for comp in forms[i + 1 :]:
            if (
                form.form == comp.form
                and form.tags == comp.tags
                and form.raw_tags == comp.raw_tags
            ):
                # basically a "continue" for the outer for-loop, except
                # that this will not trigger the following else-block
                break
        else:
            # no duplicate found in the inner loop (it exited without
            # breaking)
            new_forms.append(form)
    if len(forms) > len(new_forms):
        # wxr.wtp.debug("Found duplicate forms", sortid="simple/pos/32")
        return new_forms
    return forms
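
# Note on semantics (an observation, not an original comment): each form is
# compared only against *later* entries, so when duplicates exist it is the
# last occurrence that survives into `new_forms`.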



def get_stem(word: str) -> str:
    """Get the stem of a Greek adjective or participle."""
    vowels = "αειουηω"
    vowel_digraphs = ["αι", "ει", "οι", "ου", "υι"]
    idx = len(word)
    nword = strip_accents(word)  # normalized
    # 1. Consume every trailing consonant until we find a vowel.
    while idx > 0 and nword[idx - 1] not in vowels:
        idx -= 1
    # 2. Consume the final vowel or digraph.
    if idx > 2 and nword[idx - 2 : idx] in vowel_digraphs:
        idx -= 2
    elif idx > 1 and nword[idx - 1] in vowels:
        idx -= 1
    return word[:idx]
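
# Worked example (illustrative): for "άμεσος", step 1 drops the final "ς"
# (a consonant) and step 2 drops the single vowel "ο", so the function
# returns the stem "άμεσ", onto which endings are re-attached elsewhere.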



def expand_suffix_forms(forms: list[Form]) -> list[Form]:
    """Expand headword suffix endings for Greek adjectives or participles.

    Assumes that the forms are given in parsed order, i.e. masc/fem/neut.

    References:
    * https://el.wiktionary.org/wiki/Παράρτημα:Επίθετα_και_μετοχές_(νέα_ελληνικά)
    * https://el.wiktionary.org/wiki/άμεσος (adjective)
    * https://el.wiktionary.org/wiki/αρσενικός (adjective)
    * https://el.wiktionary.org/wiki/αναμμένος (participle)
    """
    assert len(forms) == 3
    base, *others = forms
    word = base.form
    stem = get_stem(word)

    associated_tags = [
        ["masculine", "singular", "nominative"],
        ["feminine", "singular", "nominative"],
        ["neuter", "singular", "nominative"],
    ]

    expanded_forms: list[Form] = []
    new_form = base.model_copy(deep=True)
    new_form.tags.extend(associated_tags[0])
    expanded_forms.append(new_form)

    for idx, form in enumerate(others, 1):
        for ending in form.form.replace("-", "").split("/"):
            new_form = form.model_copy(deep=True)
            new_form.form = f"{stem}{ending}"
            new_form.tags.extend(associated_tags[idx])
            expanded_forms.append(new_form)

    return expanded_forms
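
# Usage sketch (hedged; Form fields follow .models, values are illustrative):
#   expanded = expand_suffix_forms(
#       [Form(form="άμεσος"), Form(form="-η"), Form(form="-ο")]
#   )
# With get_stem("άμεσος") == "άμεσ", this would yield forms "άμεσος",
# "άμεση" and "άμεσο", tagged masculine, feminine and neuter singular
# nominative respectively.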