import re
from typing import Any

from mediawiki_langcodes import code_to_name, name_to_code

# NodeKind is an internal enum for WikiNode and subclasses that specifies
# what kind of WikiNode it is. Subclasses also have the field, but it's
# always NodeKind.TEMPLATE for TemplateNodes etc.
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, NodeKind, WikiNode

# clean_node takes a WikiNode+strings node or tree and gives you a cleanish
# text version.
from wiktextract.page import clean_node, clean_value

# The main context object to more easily share parsing state between
# functions. Contains WiktextractContext.wtp, which is the context for
# wikitextprocessor and usually holds all the good stuff.
from wiktextract.wxr_context import WiktextractContext

# For debug printing when doing batches, and for log messages that don't
# make sense as word-specific debug, warning, or error messages (see those
# in wikitextprocessor's context).
from wiktextract.wxr_logging import logger

from .etymology import process_etym
from .models import WordEntry
from .parse_utils import (
    POSReturns,
    find_sections,
    parse_lower_heading,
    strip_accents,
)
from .pos import process_pos
from .pronunciation import process_pron
from .section_titles import (
    Heading,
    Tags,
)

# from .text_utils import ENDING_NUMBER_RE


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse Greek Wiktionary (el.wiktionary.org) page.

    References:
    * https://el.wiktionary.org/wiki/Βικιλεξικό:Δομή_λημμάτων
    """

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    parts = []
    parts.append(page_title)

    # from .debug_bypass import debug_bypass
    # return debug_bypass(wxr, page_title, page_text)

    if page_title.startswith("Πύλη:"):
        return []

    page_root = wxr.wtp.parse(page_text)

    # print_tree(page_root)  # WikiNode tree pretty printer
    word_datas: list[WordEntry] = []

    # stuff_outside_main_headings = page_root.invert_find_child(
    #     LEVEL_KIND_FLAGS)

    # Handle stuff at the very top of the page
    # for thing_node in stuff_outside_main_headings:
    #     ...

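    # When a language section turns out to be empty because its POS
    # sections were mistakenly added as sibling levels instead of
    # children, remember the language here so that the stray levels
    # that follow can still be attached to it; see the `if not ok`
    # fallback inside the loop below.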

    previous_empty_language_name: str | None = None
    previous_empty_language_code: str | None = None

    for level in page_root.find_child(LEVEL_KIND_FLAGS):
        # Contents of the heading itself; should be "Languagename".
        # clean_node() is the general purpose WikiNode/string -> string
        # implementation. Things like formatting are stripped; it mimics
        # the output of wikitext when possible.
        # == English == # <- This part
        # === Noun ===
        lang_name, lang_code, ok = parse_language_name(
            wxr, clean_node(wxr, None, level.largs).strip()
        )

        section_num = -1

        # print("=====")
        # print(f"{level=}\n => {clean_node(wxr, None, level.largs).strip()}")

        sublevels = list(level.find_child(LEVEL_KIND_FLAGS))

        if not ok:
            if level.kind not in (NodeKind.LEVEL1, NodeKind.LEVEL2):
                # We tried to parse a lower level as a language because it
                # was a direct child of root and failed, so let's just ignore
                # it and not print a warning.
                continue
            if (
                previous_empty_language_name is None
                or previous_empty_language_code is None
            ):
                wxr.wtp.warning(
                    f"Can't parse language header: '{lang_name}'; "
                    "skipping section",
                    sortid="page/111",
                )
                continue
            lang_name = previous_empty_language_name
            lang_code = previous_empty_language_code
            sublevels = [level]

        wxr.wtp.start_section(lang_name)

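        # base_data is the language-level template entry: the etymology
        # and pronunciation handling below update it in place, and each
        # POS section later receives its own deep copy of it.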

        base_data = WordEntry(
            word=page_title,
            lang_code=lang_code,
            lang=lang_name,
            pos="ERROR_UNKNOWN_POS",
        )

        prev_data: WordEntry | None = None

        if len(sublevels) == 0 and ok:
            # Someone messed up by putting a Level 1 directly after a language
            # header.
            previous_empty_language_name = lang_name
            previous_empty_language_code = lang_code
            continue

        previous_empty_language_name = None
        previous_empty_language_code = None

        # XXX Some tables are put directly into the language level's content
        # Separate content and sublevels, parse content and put in base_data

        for sublevel in sublevels:
            if len(sublevel.largs) == 0:
                wxr.wtp.debug(
                    f"Sublevel without .largs: {sublevel=}", sortid="page/92"
                )
                continue

            heading_title = (
                clean_node(wxr, None, sublevel.largs[0]).lower().strip("= \n")
            )

            type, pos, heading_name, tags, num, ok = parse_lower_heading(
                wxr, heading_title
            )

            section_num = max(num, section_num)

            if not ok:
                wxr.wtp.warning(
                    f"Sub-language heading '{heading_title}' couldn't "
                    f"be parsed as a heading; "
                    f"{type=}, {heading_name=}, {tags=}.",
                    sortid="page/103/20241112",
                )
                continue

            if type in (Heading.Err, Heading.Ignored):
                continue
            ## TEMP

            found_pos_sections: POSReturns = []
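            # Each entry is a tuple of (pos, heading name, tags, section
            # number, section node, deep copy of base_data); see the
            # Heading.POS branch and the unpacking loop below.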

            if type is Heading.Etym:
                # Update base_data with etymology and maybe sound data.
                # Return any sublevels in the etymology section
                # so that we can check for POS sections.
                num, etym_sublevels = process_etym(
                    wxr, base_data, sublevel, heading_name, section_num
                )

                section_num = max(num, section_num)

                found_pos_sections.extend(etym_sublevels)

            # ...
            # text = clean_node(wxr, None, sublevel)
            # text = wxr.wtp.node_to_wikitext(sublevel)
            # if "\n=" in text:
            #     text = "£ " + "\n£ ".join(text.splitlines())
            # logger.warning(f"£ {wxr.wtp.title}\n" + text)

            # PRINTS HERE

            # continue

            ## /TEMP

            # Typical pronunciation section that applies to the whole
            # entry
            if type is Heading.Pron:
                # Update base_data with sound and hyphenation data.
                # Return any sublevels in the pronunciation section
                # so that we can check for POS sections.
                num, pron_sublevels = process_pron(
                    wxr, sublevel, base_data, heading_name, section_num
                )

                section_num = max(num, section_num)

                found_pos_sections.extend(pron_sublevels)

            if type is Heading.POS:
                found_pos_sections.append(
                    (
                        pos,
                        heading_name,
                        tags,
                        section_num,
                        sublevel,
                        base_data.model_copy(deep=True),
                    )
                )

            #################################################
            # Finally handle all POS sections we've extracted
            for (
                pos,
                title,
                tags,
                num,
                pos_section,
                pos_base_data,
            ) in found_pos_sections:
                if (
                    pos_ret := process_pos(
                        wxr,
                        pos_section,
                        pos_base_data.model_copy(deep=True),
                        prev_data,
                        pos,  # heading_name is the English pos
                        title,
                        tags,
                        num,
                    )
                ) is not None:
                    word_datas.append(pos_ret)
                    prev_data = pos_ret
                else:
                    wxr.wtp.error(
                        f"Couldn't parse PoS section {pos}",
                        sortid="page.py/20250110",
                    )

    # logger.info("%%" + "\n%%".join(parts))
    # Transform pydantic objects to normal dicts so that the old code can
    # handle them.
    return [wd.model_dump(exclude_defaults=True) for wd in word_datas]
    # return [base_data.model_dump(exclude_defaults=True)]


LANGUAGE_HEADINGS_RE = re.compile(r"([\w\s]+)\(([-\w]+)\)")
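# Matches language headings of the form "name (code)"; for example,
# "νέα ελληνικά (el)" yields groups ("νέα ελληνικά ", "el").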

IRREGULAR_LANGUAGE_HEADINGS = {
    "διαγλωσσικοί όροι": {"name": "Translingual", "code": "mul"},
    "διεθνείς όροι": {"name": "Translingual", "code": "mul"},
    "νέα ελληνικά (el)": {"code": "el"},
    "μεσαιωνικά ελληνικά (gkm)": {"name": "Medieval Greek", "code": "gkm"},
    "μεσαιωνικά ελληνικά": {"name": "Medieval Greek", "code": "gkm"},
    "αρωμουνικά (βλάχικα) (roa-rup)": {"code": "roa-rup"},
    "κρητικά (el-crt)": {"code": "el-crt", "name": "Cretan Greek"},
    "κυπριακά (el-cyp)": {"code": "el-cyp", "name": "Cypriot Greek"},
    "χαρακτήρας unicode": {"code": "mul", "name": "Translingual"},
    # "": {"code": ""},
}


def parse_language_name(
    wxr: WiktextractContext, lang_heading: str
) -> tuple[str, str, bool]:
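    """Resolve a language-section heading to (English name, code, ok).

    Headings are either one of the irregular strings above (like
    "διεθνείς όροι"), a "name (code)" pair like "νέα ελληνικά (el)", or a
    bare Greek language name that name_to_code() can resolve. A sketch of
    the expected behavior (the exact names come from mediawiki_langcodes,
    so treat these strings as an assumption):
        "αγγλικά (en)" -> ("English", "en", True)
    """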

    lang_heading = lang_heading.strip()
    irregulars = IRREGULAR_LANGUAGE_HEADINGS.get(lang_heading.lower(), None)
    if irregulars is not None:
        return (
            irregulars.get("name") or code_to_name(irregulars["code"], "en"),
            irregulars["code"],
            True,
        )

    m = LANGUAGE_HEADINGS_RE.match(lang_heading)
    if m is None:
        lang_code = name_to_code(lang_heading, "el")
        if not lang_code:
            return lang_heading, "", False
        english_lang_name = code_to_name(lang_code, "en")
        if not english_lang_name:
            wxr.wtp.warning(
                f"Invalid lang_code '{lang_code}'", sortid="page/194"
            )
            return lang_heading, "", False
        return english_lang_name, lang_code, True
    else:
        matched_name = m.group(1).lower().strip()
        lang_code = m.group(2)
        greek_lang_name = code_to_name(lang_code, "el")
        english_lang_name = code_to_name(lang_code, "en")
        if not english_lang_name:
            wxr.wtp.warning(
                f"Invalid lang_code '{lang_code}'", sortid="page/43a"
            )
            return lang_heading, "", False
        if strip_accents(greek_lang_name).lower() != strip_accents(
            matched_name
        ):
            wxr.wtp.debug(
                f"Greek name '{greek_lang_name}' for language code "
                f"'{lang_code}' does not match "
                f"original string '{lang_heading}'; "
                f"outputting {english_lang_name}",
                sortid="page/45",
            )
        return english_lang_name, lang_code, True