Coverage for src / wiktextract / extractor / el / page.py: 67%

109 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from typing import Any, cast 

3 

4from mediawiki_langcodes import code_to_name, name_to_code 

5 

6# NodeKind is an internal enum for WikiNode and subclasses that specifies 

7# what kind of WikiNode it is. Subclasses also have the field, but it's 

8# always NodeKind.TEMPLATE for TemplateNodes etc. 

9from wikitextprocessor import TemplateNode 

10from wikitextprocessor.parser import LEVEL_KIND_FLAGS, NodeKind 

11 

12# Clean node takes a WikiNode+strings node or tree and gives you a cleanish text 

13from wiktextract.extractor.el.table import process_inflection_section 

14from wiktextract.page import clean_node 

15 

16# The main context object to more easily share state of parsing between 

17# functions. Contains WiktextractContext.wtp, which is the context for 

18# wikitextprocessor and usually holds all the good stuff. 

19from wiktextract.wxr_context import WiktextractContext 

20 

21# For debug printing when doing batches and log messages that don't make 

22# sense as word-specific debug, warning or error messages (see those 

23# in wikitextprocessor's context). 

24from wiktextract.wxr_logging import logger 

25 

26from .etymology import process_etym 

27from .models import WordEntry 

28from .parse_utils import ( 

29 POSReturns, 

30 parse_lower_heading, 

31 strip_accents, 

32) 

33from .pos import process_pos 

34from .pronunciation import process_pron 

35from .section_titles import Heading, POSName 

36 

37 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a Greek Wiktionary (el.wiktionary.org) page.

    Iterates over the page's top-level language sections, extracts
    etymology, pronunciation and part-of-speech data for each, and
    returns the results as plain dicts.

    References:
    * https://el.wiktionary.org/wiki/Βικιλεξικό:Δομή_λημμάτων
    """

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Portal ("Πύλη:") pages are navigation pages, not dictionary entries.
    if page_title.startswith("Πύλη:"):
        return []

    page_root = wxr.wtp.parse(page_text)

    # Accumulates one WordEntry per successfully parsed POS section.
    word_data: list[WordEntry] = []

    # When a language heading has no subsections (someone put a LEVEL1
    # heading directly after the language header), remember it so the
    # following orphaned section can be attributed to that language.
    previous_empty_language_name: str | None = None
    previous_empty_language_code: str | None = None

    for level in page_root.find_child(LEVEL_KIND_FLAGS):
        # Contents of the heading itself; should be "Languagename".
        # clean_node() is the general purpose WikiNode/string -> string
        # implementation. Things like formatting are stripped; it mimics
        # the output of wikitext when possible.
        # == English == # <- This part
        # === Noun ===
        lang_name, lang_code, ok = parse_language_name(
            wxr, clean_node(wxr, None, level.largs).strip()
        )

        section_num = -1

        sublevels = list(level.find_child(LEVEL_KIND_FLAGS))

        if not ok:
            if level.kind not in (NodeKind.LEVEL1, NodeKind.LEVEL2):
                # We tried to parse a lower level as a language because it
                # was a direct child of root and failed, so let's just ignore
                # it and not print a warning.
                continue
            if (
                previous_empty_language_name is None
                or previous_empty_language_code is None
            ):
                wxr.wtp.wiki_notice(
                    f"Can't parse language header: '{lang_name}'; "
                    "skipping section",
                    sortid="page/111",
                )
                continue
            # Orphaned section following an empty language heading:
            # treat it as belonging to that language.
            lang_name = previous_empty_language_name
            lang_code = previous_empty_language_code
            sublevels = [level]

        wxr.wtp.start_section(lang_name)

        # Template entry copied (deep) into each POS section's data;
        # the placeholder pos value must be overwritten by process_pos.
        base_data = WordEntry(
            word=page_title,
            lang_code=lang_code,
            lang=lang_name,
            pos="ERROR_UNKNOWN_POS",
        )

        prev_data: WordEntry | None = None

        if len(sublevels) == 0 and ok:
            # Someone messed up by putting a Level 1 directly after a language
            # header.
            previous_empty_language_name = lang_name
            previous_empty_language_code = lang_code
            continue

        previous_empty_language_name = None
        previous_empty_language_code = None

        # Parse inflection tables placed directly in the language level's
        # content. Ex. from https://el.wiktionary.org/wiki/αμάξι
        # =={{-el-}}==
        # {{el-κλίση-'τραγούδι'}} <= THIS
        # ...
        #
        # Notes:
        # * Only support Modern Greek pages at the moment.
        # * There can be more than one inflection: ρολόι, πλάγιος
        if (
            level.kind == NodeKind.LEVEL2
            and level.largs
            and clean_node(wxr, None, level.largs[0]) == "Νέα ελληνικά (el)"
        ):
            for child in level.children:
                if isinstance(
                    child, TemplateNode
                ) and child.template_name.startswith("el-κλίση"):
                    process_inflection_section(
                        wxr,
                        base_data,
                        child,
                        source="declension",
                    )

        for sublevel in sublevels:
            if len(sublevel.largs) == 0:
                wxr.wtp.debug(
                    f"Sublevel without .largs: {sublevel=}", sortid="page/92"
                )
                continue

            heading_title = (
                clean_node(wxr, None, sublevel.largs[0]).lower().strip("= \n")
            )

            heading_type, pos, tags, num, ok = parse_lower_heading(
                wxr, heading_title
            )

            # Section numbers only ever increase within a language section.
            section_num = num if num > section_num else section_num

            if not ok:
                # Fix: the original message repeated "be" across the
                # adjacent f-string fragments ("couldn't be " "be parsed").
                wxr.wtp.wiki_notice(
                    f"Sub-language heading '{heading_title}' couldn't "
                    f"be parsed as a heading; "
                    f"{heading_type=}, {heading_title=}, {tags=}.",
                    sortid="page/103/20241112",
                )
                continue

            if heading_type in (Heading.Err, Heading.Ignored):
                continue

            # POS sections found directly or nested inside etymology /
            # pronunciation sections; handled together below.
            found_pos_sections: POSReturns = []

            if heading_type == Heading.Etym:
                # Update base_data with etymology and maybe sound data.
                # Return any sublevels in the etymology section
                # so that we can check for POS sections.
                num, etym_sublevels = process_etym(
                    wxr, base_data, sublevel, heading_title, section_num
                )

                section_num = num if num > section_num else section_num

                found_pos_sections.extend(etym_sublevels)

            # Typical pronunciation section that applies to the whole
            # entry
            if heading_type == Heading.Pron:
                # Update base_data with sound and hyphenation data.
                # Return any sublevels in the pronunciation section
                # so that we can check for POS sections.
                num, pron_sublevels = process_pron(
                    wxr, sublevel, base_data, heading_title, section_num
                )

                section_num = num if num > section_num else section_num

                found_pos_sections.extend(pron_sublevels)

            if heading_type == Heading.POS:
                # SAFETY: Since the heading_type is POS, parse_lower_heading
                # "pos_or_section" is guaranteed to be a pos: POSName
                pos = cast(POSName, pos)
                found_pos_sections.append(
                    (
                        pos,
                        heading_title,
                        tags,
                        section_num,
                        sublevel,
                        base_data.model_copy(deep=True),
                    )
                )

            #################################################
            # Finally handle all POS sections we've extracted
            for (
                pos,
                title,
                tags,
                num,
                pos_section,
                pos_base_data,
            ) in found_pos_sections:
                if (
                    pos_ret := process_pos(
                        wxr,
                        pos_section,
                        pos_base_data.model_copy(deep=True),
                        prev_data,
                        pos,
                        title,
                        tags,
                        num,
                    )
                ) is not None:
                    word_data.append(pos_ret)
                    prev_data = pos_ret
                else:
                    wxr.wtp.error(
                        f"Couldn't parse PoS section {pos}",
                        sortid="page.py/20250110",
                    )

    # Transform pydantic objects to normal dicts so that the old code can
    # handle them.
    return [wd.model_dump(exclude_defaults=True) for wd in word_data]

276 

277 

# Matches the standard "Languagename (code)" heading form, e.g.
# "Αγγλικά (en)": group 1 is the language name (word characters and
# spaces), group 2 the language code (word characters and hyphens).
LANGUAGE_HEADINGS_RE = re.compile(r"([\w\s]+)\(([-\w]+)\)")

# Language headings that LANGUAGE_HEADINGS_RE / name_to_code() can't
# resolve correctly. Keys are lowercased heading text; "name" is the
# English language name and may be omitted when code_to_name() can
# derive it from "code" (see parse_language_name).
IRREGULAR_LANGUAGE_HEADINGS = {
    "διαγλωσσικοί όροι": {"name": "Translingual", "code": "mul"},
    "διεθνείς όροι": {"name": "Translingual", "code": "mul"},
    "νέα ελληνικά (el)": {"code": "el"},
    "μεσαιωνικά ελληνικά (gkm)": {"name": "Medieval Greek", "code": "gkm"},
    "μεσαιωνικά ελληνικά": {"name": "Medieval Greek", "code": "gkm"},
    "αρωμουνικά (βλάχικα) (roa-rup)": {"code": "roa-rup"},
    "κρητικά (el-crt)": {"code": "el-crt", "name": "Cretan Greek"},
    "κυπριακά (el-cyp)": {"code": "el-cyp", "name": "Cypriot Greek"},
    "χαρακτήρας unicode": {"code": "mul", "name": "Translingual"},
    # "": {"code": ""},
}

292 

293 

def parse_language_name(
    wxr: WiktextractContext, lang_heading: str
) -> tuple[str, str, bool]:
    """Resolve a language heading into (English name, lang code, success).

    On failure the original heading is returned with an empty code and
    False as the success flag.
    """
    heading = lang_heading.strip()

    # Known headings that don't follow the usual "Name (code)" pattern.
    special = IRREGULAR_LANGUAGE_HEADINGS.get(heading.lower())
    if special is not None:
        en_name = special.get("name") or code_to_name(special["code"], "en")
        return en_name, special["code"], True

    match_ = LANGUAGE_HEADINGS_RE.match(heading)

    if match_ is None:
        # No "(code)" suffix; try resolving the Greek name directly.
        code = name_to_code(heading, "el")
        if not code:
            return heading, "", False
        en_name = code_to_name(code, "en")
        if en_name:
            return en_name, code, True
        wxr.wtp.warning(
            f"Invalid lang_code '{code}'", sortid="page/194"
        )
        return heading, "", False

    # Heading matched "Name (code)"; trust the code, but sanity-check
    # the name part against the code's official Greek name.
    name_part = match_.group(1).lower().strip()
    code = match_.group(2)
    el_name = code_to_name(code, "el")
    en_name = code_to_name(code, "en")
    if not en_name:
        wxr.wtp.warning(
            f"Invalid lang_code '{code}'", sortid="page/43a"
        )
        return heading, "", False
    if strip_accents(el_name).lower() != strip_accents(name_part):
        wxr.wtp.debug(
            f"Language code '{code}' "
            f"Greek name '{el_name}' does not match "
            f"original string '{heading}'; "
            f"outputting {en_name}",
            sortid="page/45",
        )
    return en_name, code, True