import re
from typing import Any

from mediawiki_langcodes import code_to_name, name_to_code

# NodeKind is an internal enum for WikiNode and subclasses that specifies
# what kind of WikiNode it is. Subclasses also have the field, but it's
# always NodeKind.TEMPLATE for TemplateNodes etc.
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, NodeKind, WikiNode

# clean_node takes a WikiNode+strings node or tree and gives you a cleanish
# text version.
from wiktextract.page import clean_node, clean_value

# The main context object to more easily share parsing state between
# functions. Contains WiktextractContext.wtp, which is the context for
# wikitextprocessor and usually holds all the good stuff.
from wiktextract.wxr_context import WiktextractContext

# For debug printing when doing batches, and for log messages that don't
# make sense as word-specific debug, warning, or error messages (see those
# in wikitextprocessor's context).
from wiktextract.wxr_logging import logger

from .etymology import process_etym
from .models import WordEntry
from .parse_utils import (
    POSReturns,
    find_sections,
    parse_lower_heading,
    strip_accents,
)
from .pos import process_pos
from .pronunciation import process_pron
from .section_titles import (
    Heading,
    Tags,
)

# from .text_utils import ENDING_NUMBER_RE


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse Greek Wiktionary (el.wiktionary.org) page.

    References:
    * https://el.wiktionary.org/wiki/Βικιλεξικό:Δομή_λημμάτων
    """

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    parts = []
    parts.append(page_title)

    # from .debug_bypass import debug_bypass
    # return debug_bypass(wxr, page_title, page_text)

    if page_title.startswith("Πύλη:"):
        return []

    page_root = wxr.wtp.parse(page_text)

    # print_tree(page_root)  # WikiNode tree pretty printer
    word_datas: list[WordEntry] = []

    # stuff_outside_main_headings = page_root.invert_find_child(
    #     LEVEL_KIND_FLAGS)

    # Handle stuff at the very top of the page
    # for thing_node in stuff_outside_main_headings:
    #     ...

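    # When a language section turns out to be empty because its POS
    # sections were mistakenly added as sibling levels instead of
    # children, remember the language here so that the stray levels
    # that follow can still be attached to it; see the `if not ok`
    # fallback inside the loop below.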

    previous_empty_language_name: str | None = None
    previous_empty_language_code: str | None = None

    for level in page_root.find_child(LEVEL_KIND_FLAGS):
        # Contents of the heading itself; should be "Languagename".
        # clean_node() is the general purpose WikiNode/string -> string
        # implementation. Things like formatting are stripped; it mimics
        # the output of wikitext when possible.
        # == English == # <- This part
        # === Noun ===
        lang_name, lang_code, ok = parse_language_name(
            wxr, clean_node(wxr, None, level.largs).strip()
        )

        section_num = -1

        # print("=====")
        # print(f"{level=}\n => {clean_node(wxr, None, level.largs).strip()}")

        sublevels = list(level.find_child(LEVEL_KIND_FLAGS))

        if not ok:
            if level.kind not in (NodeKind.LEVEL1, NodeKind.LEVEL2):
                # We tried to parse a lower level as a language because it
                # was a direct child of root and failed, so let's just ignore
                # it and not print a warning.
                continue
            if (
                previous_empty_language_name is None
                or previous_empty_language_code is None
            ):
                wxr.wtp.warning(
                    f"Can't parse language header: '{lang_name}'; "
                    "skipping section",
                    sortid="page/111",
                )
                continue
            lang_name = previous_empty_language_name
            lang_code = previous_empty_language_code
            sublevels = [level]

        wxr.wtp.start_section(lang_name)

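        # base_data is the language-level template entry: the etymology
        # and pronunciation handling below update it in place, and each
        # POS section later receives its own deep copy of it.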

        base_data = WordEntry(
            word=page_title,
            lang_code=lang_code,
            lang=lang_name,
            pos="ERROR_UNKNOWN_POS",
        )

        prev_data: WordEntry | None = None

        if len(sublevels) == 0 and ok:
            # Someone messed up by putting a Level 1 directly after a language
            # header.
            previous_empty_language_name = lang_name
            previous_empty_language_code = lang_code
            continue

        previous_empty_language_name = None
        previous_empty_language_code = None

        # XXX Some tables are put directly into the language level's content
        # Separate content and sublevels, parse content and put in base_data

        for sublevel in sublevels:
            if len(sublevel.largs) == 0:
                wxr.wtp.debug(
                    f"Sublevel without .largs: {sublevel=}", sortid="page/92"
                )
                continue

            heading_title = (
                clean_node(wxr, None, sublevel.largs[0]).lower().strip("= \n")
            )

            type, pos, heading_name, tags, num, ok = parse_lower_heading(
                wxr, heading_title
            )

            section_num = max(num, section_num)

            if not ok:
                wxr.wtp.warning(
                    f"Sub-language heading '{heading_title}' couldn't "
                    f"be parsed as a heading; "
                    f"{type=}, {heading_name=}, {tags=}.",
                    sortid="page/103/20241112",
                )
                continue

            if type in (Heading.Err, Heading.Ignored):
                continue
            ## TEMP

            found_pos_sections: POSReturns = []
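            # Each entry is a tuple of (pos, heading name, tags, section
            # number, section node, deep copy of base_data); see the
            # Heading.POS branch and the unpacking loop below.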

            if type is Heading.Etym:
                # Update base_data with etymology and maybe sound data.
                # Return any sublevels in the etymology section
                # so that we can check for POS sections.
                num, etym_sublevels = process_etym(
                    wxr, base_data, sublevel, heading_name, section_num
                )

                section_num = max(num, section_num)

                found_pos_sections.extend(etym_sublevels)

            # ...
            # text = clean_node(wxr, None, sublevel)
            # text = wxr.wtp.node_to_wikitext(sublevel)
            # if "\n=" in text:
            #     text = "£ " + "\n£ ".join(text.splitlines())
            # logger.warning(f"£ {wxr.wtp.title}\n" + text)

            # PRINTS HERE

            # continue

            ## /TEMP

            # Typical pronunciation section that applies to the whole
            # entry
            if type is Heading.Pron:
                # Update base_data with sound and hyphenation data.
                # Return any sublevels in the pronunciation section
                # so that we can check for POS sections.
                num, pron_sublevels = process_pron(
                    wxr, sublevel, base_data, heading_name, section_num
                )

                section_num = max(num, section_num)

                found_pos_sections.extend(pron_sublevels)

            if type is Heading.POS:
                found_pos_sections.append(
                    (
                        pos,
                        heading_name,
                        tags,
                        section_num,
                        sublevel,
                        base_data.model_copy(deep=True),
                    )
                )

            #################################################
            # Finally handle all POS sections we've extracted
            for (
                pos,
                title,
                tags,
                num,
                pos_section,
                pos_base_data,
            ) in found_pos_sections:
                if (
                    pos_ret := process_pos(
                        wxr,
                        pos_section,
                        pos_base_data.model_copy(deep=True),
                        prev_data,
                        pos,  # heading_name is the English pos
                        title,
                        tags,
                        num,
                    )
                ) is not None:
                    word_datas.append(pos_ret)
                    prev_data = pos_ret
                else:
                    wxr.wtp.error(
                        f"Couldn't parse PoS section {pos}",
                        sortid="page.py/20250110",
                    )

    # logger.info("%%" + "\n%%".join(parts))
    # Transform pydantic objects to normal dicts so that the old code can
    # handle them.
    return [wd.model_dump(exclude_defaults=True) for wd in word_datas]
    # return [base_data.model_dump(exclude_defaults=True)]


LANGUAGE_HEADINGS_RE = re.compile(r"([\w\s]+)\(([-\w]+)\)")
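# Matches language headings of the form "name (code)"; for example,
# "νέα ελληνικά (el)" yields groups ("νέα ελληνικά ", "el").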

IRREGULAR_LANGUAGE_HEADINGS = {
    "διαγλωσσικοί όροι": {"name": "Translingual", "code": "mul"},
    "διεθνείς όροι": {"name": "Translingual", "code": "mul"},
    "νέα ελληνικά (el)": {"code": "el"},
    "μεσαιωνικά ελληνικά (gkm)": {"name": "Medieval Greek", "code": "gkm"},
    "μεσαιωνικά ελληνικά": {"name": "Medieval Greek", "code": "gkm"},
    "αρωμουνικά (βλάχικα) (roa-rup)": {"code": "roa-rup"},
    "κρητικά (el-crt)": {"code": "el-crt", "name": "Cretan Greek"},
    "κυπριακά (el-cyp)": {"code": "el-cyp", "name": "Cypriot Greek"},
    "χαρακτήρας unicode": {"code": "mul", "name": "Translingual"},
    # "": {"code": ""},
}


def parse_language_name(
    wxr: WiktextractContext, lang_heading: str
) -> tuple[str, str, bool]:
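    """Resolve a language-section heading to (English name, code, ok).

    Headings are either one of the irregular strings above (like
    "διεθνείς όροι"), a "name (code)" pair like "νέα ελληνικά (el)", or a
    bare Greek language name that name_to_code() can resolve. A sketch of
    the expected behavior (the exact names come from mediawiki_langcodes,
    so treat these strings as an assumption):
        "αγγλικά (en)" -> ("English", "en", True)
    """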

    lang_heading = lang_heading.strip()
    irregulars = IRREGULAR_LANGUAGE_HEADINGS.get(lang_heading.lower(), None)
    if irregulars is not None:
        return (
            irregulars.get("name") or code_to_name(irregulars["code"], "en"),
            irregulars["code"],
            True,
        )

    m = LANGUAGE_HEADINGS_RE.match(lang_heading)
    if m is None:
        lang_code = name_to_code(lang_heading, "el")
        if not lang_code:
            return lang_heading, "", False
        english_lang_name = code_to_name(lang_code, "en")
        if not english_lang_name:
            wxr.wtp.warning(
                f"Invalid lang_code '{lang_code}'", sortid="page/194"
            )
            return lang_heading, "", False
        return english_lang_name, lang_code, True
    else:
        matched_name = m.group(1).lower().strip()
        lang_code = m.group(2)
        greek_lang_name = code_to_name(lang_code, "el")
        english_lang_name = code_to_name(lang_code, "en")
        if not english_lang_name:
            wxr.wtp.warning(
                f"Invalid lang_code '{lang_code}'", sortid="page/43a"
            )
            return lang_heading, "", False
        if strip_accents(greek_lang_name).lower() != strip_accents(
            matched_name
        ):
            wxr.wtp.debug(
                f"Greek name '{greek_lang_name}' for language code "
                f"'{lang_code}' does not match "
                f"original string '{lang_heading}'; "
                f"outputting {english_lang_name}",
                sortid="page/45",
            )
        return english_lang_name, lang_code, True