# Coverage report header (coverage.py v7.11.0, created 2025-11-03 05:44 +0000)
# for src/wiktextract/extractor/el/page.py: 58% of 111 statements covered.

import re
from typing import Any, cast

from mediawiki_langcodes import code_to_name, name_to_code

# NodeKind is an internal enum for WikiNode and subclasses that specifies
# what kind of WikiNode it is. Subclasses also have the field, but it's
# always NodeKind.TEMPLATE for TemplateNodes etc.
from wikitextprocessor import TemplateNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, NodeKind

# clean_node takes a WikiNode+strings node or tree and gives you a cleanish
# text.
from wiktextract.extractor.el.table import process_inflection_section
from wiktextract.page import clean_node

# The main context object to more easily share state of parsing between
# functions. Contains WiktextractContext.wtp, which is the context for
# wikitextprocessor and usually holds all the good stuff.
from wiktextract.wxr_context import WiktextractContext

# For debug printing when doing batches and log messages that don't make
# sense as word-specific debug, warning or error messages (see those
# in wikitextprocessor's context).
from wiktextract.wxr_logging import logger

from .etymology import process_etym
from .models import WordEntry
from .parse_utils import (
    POSReturns,
    parse_lower_heading,
    strip_accents,
)
from .pos import process_pos
from .pronunciation import process_pron
from .section_titles import Heading, POSName

# from .text_utils import ENDING_NUMBER_RE

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a Greek Wiktionary (el.wiktionary.org) page.

    Walks the page's level headings: top-level sections are language
    headings ("== Νέα ελληνικά (el) =="), and their children are
    etymology, pronunciation and part-of-speech sections. POS sections
    (including those nested under etymology/pronunciation) are collected
    and handed to process_pos, which produces one WordEntry each.

    Args:
        wxr: Shared extraction context (config + wikitextprocessor).
        page_title: The page (word) title.
        page_text: Raw wikitext of the page.

    Returns:
        A list of word-entry dicts (pydantic models dumped with
        exclude_defaults=True) for the old downstream code.

    References:
    * https://el.wiktionary.org/wiki/Βικιλεξικό:Δομή_λημμάτων
    """

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Leftover debug scaffolding; only referenced by the commented-out
    # logger call at the bottom of this function.
    parts = []
    parts.append(page_title)

    # from .debug_bypass import debug_bypass
    # return debug_bypass(wxr, page_title, page_text)

    # Skip portal ("Πύλη:") pseudo-pages; they are not dictionary entries.
    if page_title.startswith("Πύλη:"):
        return []

    page_root = wxr.wtp.parse(
        page_text,
    )

    # print_tree(page_root)  # WikiNode tree pretty printer
    word_datas: list[WordEntry] = []

    # stuff_outside_main_headings = page_root.invert_find_child(
    #     LEVEL_KIND_FLAGS)

    # Handle stuff at the very top of the page
    # for thing_node in stuff_outside_main_headings:
    #     ...

    # Remembered when a language heading has no subsections at all
    # (someone put a new Level 1 directly after the language header);
    # the next unparseable heading is then treated as its content.
    previous_empty_language_name: str | None = None
    previous_empty_language_code: str | None = None

    for level in page_root.find_child(LEVEL_KIND_FLAGS):
        # Contents of the heading itself; should be "Languagename".
        # clean_node() is the general purpose WikiNode/string -> string
        # implementation. Things like formatting are stripped; it mimics
        # the output of wikitext when possible.
        # == English ==  # <- This part
        # === Noun ===
        lang_name, lang_code, ok = parse_language_name(
            wxr, clean_node(wxr, None, level.largs).strip()
        )

        # Highest explicit section number seen so far in this language
        # section; -1 means "none yet".
        section_num = -1

        sublevels = list(level.find_child(LEVEL_KIND_FLAGS))

        if not ok:
            if level.kind not in (NodeKind.LEVEL1, NodeKind.LEVEL2):
                # We tried to parse a lower level as a language because it
                # was a direct child of root and failed, so let's just ignore
                # it and not print a warning.
                continue
            if (
                previous_empty_language_name is None
                or previous_empty_language_code is None
            ):
                wxr.wtp.wiki_notice(
                    f"Can't parse language header: '{lang_name}'; "
                    "skipping section",
                    sortid="page/111",
                )
                continue
            # Recover: treat this orphan heading as the body of the
            # previously seen empty language section.
            lang_name = previous_empty_language_name
            lang_code = previous_empty_language_code
            sublevels = [level]

        wxr.wtp.start_section(lang_name)

        # Template WordEntry that POS sections copy and fill in; the pos
        # placeholder must be overwritten by process_pos.
        base_data = WordEntry(
            word=page_title,
            lang_code=lang_code,
            lang=lang_name,
            pos="ERROR_UNKNOWN_POS",
        )

        prev_data: WordEntry | None = None

        if len(sublevels) == 0 and ok:
            # Someone messed up by putting a Level 1 directly after a language
            # header.
            previous_empty_language_name = lang_name
            previous_empty_language_code = lang_code
            continue

        previous_empty_language_name = None
        previous_empty_language_code = None

        # Parse tables directly into the language level's content.
        # Ex. from https://el.wiktionary.org/wiki/αμάξι
        # =={{-el-}}==
        # {{el-κλίση-'τραγούδι'}}  <= THIS
        # ...
        #
        # Notes:
        # * Only support Modern Greek pages at the moment.
        # * There can be more than one inflection: ρολόι, πλάγιος
        if (
            level.kind == NodeKind.LEVEL2
            and level.largs
            and clean_node(wxr, None, level.largs[0]) == "Νέα ελληνικά (el)"
        ):
            for child in level.children:
                if isinstance(
                    child, TemplateNode
                ) and child.template_name.startswith("el-κλίση"):
                    process_inflection_section(
                        wxr,
                        base_data,
                        child,
                        source="declension",
                    )

        for sublevel in sublevels:
            if len(sublevel.largs) == 0:
                wxr.wtp.debug(
                    f"Sublevel without .largs: {sublevel=}", sortid="page/92"
                )
                continue

            heading_title = (
                clean_node(wxr, None, sublevel.largs[0]).lower().strip("= \n")
            )

            heading_type, pos, tags, num, ok = parse_lower_heading(
                wxr, heading_title
            )

            section_num = num if num > section_num else section_num

            if not ok:
                # BUGFIX: message previously read "couldn't be be parsed".
                wxr.wtp.wiki_notice(
                    f"Sub-language heading '{heading_title}' couldn't be "
                    f"parsed as a heading; "
                    f"{heading_type=}, {heading_title=}, {tags=}.",
                    sortid="page/103/20241112",
                )
                continue

            if heading_type in (Heading.Err, Heading.Ignored):
                continue

            # POS sections found directly here or returned from the
            # etymology/pronunciation handlers.
            found_pos_sections: POSReturns = []

            if heading_type is Heading.Etym:
                # Update base_data with etymology and maybe sound data.
                # Return any sublevels in the etymology section
                # so that we can check for POS sections.
                num, etym_sublevels = process_etym(
                    wxr, base_data, sublevel, heading_title, section_num
                )

                section_num = num if num > section_num else section_num

                found_pos_sections.extend(etym_sublevels)

            # Typical pronunciation section that applies to the whole
            # entry
            if heading_type == Heading.Pron:
                # Update base_data with sound and hyphenation data.
                # Return any sublevels in the pronunciation section
                # so that we can check for POS sections.
                num, pron_sublevels = process_pron(
                    wxr, sublevel, base_data, heading_title, section_num
                )

                section_num = num if num > section_num else section_num

                found_pos_sections.extend(pron_sublevels)

            if heading_type == Heading.POS:
                # SAFETY: Since the heading_type is POS, parse_lower_heading
                # "pos_or_section" is guaranteed to be a pos: POSName
                pos = cast(POSName, pos)
                found_pos_sections.append(
                    (
                        pos,
                        heading_title,
                        tags,
                        section_num,
                        sublevel,
                        base_data.model_copy(deep=True),
                    )
                )

            #################################################
            # Finally handle all POS sections we've extracted
            for (
                pos,
                title,
                tags,
                num,
                pos_section,
                pos_base_data,
            ) in found_pos_sections:
                if (
                    pos_ret := process_pos(
                        wxr,
                        pos_section,
                        pos_base_data.model_copy(deep=True),
                        prev_data,
                        pos,
                        title,
                        tags,
                        num,
                    )
                ) is not None:
                    word_datas.append(pos_ret)
                    prev_data = pos_ret
                else:
                    wxr.wtp.error(
                        f"Couldn't parse PoS section {pos}",
                        sortid="page.py/20250110",
                    )

    # logger.info("%%" + "\n%%".join(parts))
    # Transform pydantic objects to normal dicts so that the old code can
    # handle them.
    return [wd.model_dump(exclude_defaults=True) for wd in word_datas]
    # return [base_data.model_dump(exclude_defaults=True)]

286 

287 

# Matches regular language headings like "αγγλικά (en)":
# group 1 is the (Greek) language name, group 2 the language code.
LANGUAGE_HEADINGS_RE = re.compile(r"([\w\s]+)\(([-\w]+)\)")

# Headings that the regular name/code lookup cannot resolve, keyed by the
# lower-cased heading text. "name" (optional) is the English name override;
# when absent the name is resolved from "code" via code_to_name().
IRREGULAR_LANGUAGE_HEADINGS = {
    "διαγλωσσικοί όροι": {"name": "Translingual", "code": "mul"},
    "διεθνείς όροι": {"name": "Translingual", "code": "mul"},
    "νέα ελληνικά (el)": {"code": "el"},
    "μεσαιωνικά ελληνικά (gkm)": {"name": "Medieval Greek", "code": "gkm"},
    "μεσαιωνικά ελληνικά": {"name": "Medieval Greek", "code": "gkm"},
    "αρωμουνικά (βλάχικα) (roa-rup)": {"code": "roa-rup"},
    "κρητικά (el-crt)": {"code": "el-crt", "name": "Cretan Greek"},
    "κυπριακά (el-cyp)": {"code": "el-cyp", "name": "Cypriot Greek"},
    "χαρακτήρας unicode": {"code": "mul", "name": "Translingual"},
    # "": {"code": ""},
}

302 

303 

def parse_language_name(
    wxr: WiktextractContext, lang_heading: str
) -> tuple[str, str, bool]:
    """Resolve a language heading to (english_name, lang_code, ok).

    Tries, in order: the IRREGULAR_LANGUAGE_HEADINGS table, a plain
    name-to-code lookup, and finally the "Name (code)" heading pattern.
    On failure returns the original heading, an empty code and False.
    """
    lang_heading = lang_heading.strip()

    # Irregular headings are matched case-insensitively against the table.
    irregulars = IRREGULAR_LANGUAGE_HEADINGS.get(lang_heading.lower(), None)
    if irregulars is not None:
        return (
            # Explicit English name wins; otherwise derive it from the code.
            irregulars.get("name") or code_to_name(irregulars["code"], "en"),
            irregulars["code"],
            True,
        )

    m = LANGUAGE_HEADINGS_RE.match(lang_heading)
    if m is None:
        # No "(code)" part; try to resolve the whole heading as a Greek
        # language name.
        lang_code = name_to_code(lang_heading, "el")
        if not lang_code:
            return lang_heading, "", False
        english_lang_name = code_to_name(lang_code, "en")
        if not english_lang_name:
            wxr.wtp.warning(
                f"Invalid lang_code '{lang_code}'", sortid="page/194"
            )
            return lang_heading, "", False
        return english_lang_name, lang_code, True
    else:
        # "Name (code)" heading: trust the code, but sanity-check that the
        # name part agrees with the code's canonical Greek name.
        matched_name = m.group(1).lower().strip()
        lang_code = m.group(2)
        greek_lang_name = code_to_name(lang_code, "el")
        english_lang_name = code_to_name(lang_code, "en")
        if not english_lang_name:
            wxr.wtp.warning(
                f"Invalid lang_code '{lang_code}'", sortid="page/43a"
            )
            return lang_heading, "", False
        # Accent-insensitive comparison; matched_name is already lower-cased.
        if not strip_accents(greek_lang_name).lower() == strip_accents(
            matched_name
        ):
            wxr.wtp.debug(
                f"Language code '{lang_code}' "
                f"Greek name '{greek_lang_name}' does not match "
                f"original string '{lang_heading}'; "
                f"outputting {english_lang_name}",
                sortid="page/45",
            )
        return english_lang_name, lang_code, True