Coverage for src/wiktextract/extractor/el/page.py: 58%
111 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1import re
2from typing import Any, cast
4from mediawiki_langcodes import code_to_name, name_to_code
6# NodeKind is an internal enum for WikiNode and subclasses that specifies
7# what kind of WikiNode it is. Subclasses also have the field, but it's
8# always NodeKind.TEMPLATE for TemplateNodes etc.
9from wikitextprocessor import TemplateNode
10from wikitextprocessor.parser import LEVEL_KIND_FLAGS, NodeKind
12# Clean node takes a WikiNode+strings node or tree and gives you a cleanish text
13from wiktextract.extractor.el.table import process_inflection_section
14from wiktextract.page import clean_node
16# The main context object to more easily share state of parsing between
17# functions. Contains WiktextractContext.wtp, which is the context for
18# wikitextprocessor and usually holds all the good stuff.
19from wiktextract.wxr_context import WiktextractContext
21# For debug printing when doing batches and log messages that don't make
22# sense as word-specific debug, warning or error messages (see those
23# in wikitextprocessor's context).
24from wiktextract.wxr_logging import logger
26from .etymology import process_etym
27from .models import WordEntry
28from .parse_utils import (
29 POSReturns,
30 parse_lower_heading,
31 strip_accents,
32)
33from .pos import process_pos
34from .pronunciation import process_pron
35from .section_titles import Heading, POSName
37# from .text_utils import ENDING_NUMBER_RE
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a Greek Wiktionary (el.wiktionary.org) page.

    Walks the top-level language headings of the page, collects etymology,
    pronunciation and part-of-speech sections under each language, and
    returns the extracted entries as plain dicts (dumped from WordEntry
    pydantic models) so that older downstream code can consume them.

    References:
    * https://el.wiktionary.org/wiki/Βικιλεξικό:Δομή_λημμάτων
    """
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Portal pages ("Πύλη:") are navigation pages, not dictionary entries.
    if page_title.startswith("Πύλη:"):
        return []

    page_root = wxr.wtp.parse(page_text)

    word_datas: list[WordEntry] = []

    # When a language heading turns out to have no subsections (someone put
    # a Level 1 heading directly after a language header), remember the
    # language so the next sibling level can be attributed to it.
    previous_empty_language_name: str | None = None
    previous_empty_language_code: str | None = None

    for level in page_root.find_child(LEVEL_KIND_FLAGS):
        # Contents of the heading itself; should be "Languagename".
        # clean_node() is the general purpose WikiNode/string -> string
        # implementation. Things like formatting are stripped; it mimics
        # the output of wikitext when possible.
        # == English == # <- This part
        # === Noun ===
        lang_name, lang_code, ok = parse_language_name(
            wxr, clean_node(wxr, None, level.largs).strip()
        )

        section_num = -1

        sublevels = list(level.find_child(LEVEL_KIND_FLAGS))

        if not ok:
            if level.kind not in (NodeKind.LEVEL1, NodeKind.LEVEL2):
                # We tried to parse a lower level as a language because it
                # was a direct child of root and failed, so let's just ignore
                # it and not print a warning.
                continue
            if (
                previous_empty_language_name is None
                or previous_empty_language_code is None
            ):
                wxr.wtp.wiki_notice(
                    f"Can't parse language header: '{lang_name}'; "
                    "skipping section",
                    sortid="page/111",
                )
                continue
            # Attribute this stray heading to the empty language heading
            # that immediately preceded it.
            lang_name = previous_empty_language_name
            lang_code = previous_empty_language_code
            sublevels = [level]

        wxr.wtp.start_section(lang_name)

        # Template entry copied (deeply) for each POS section found under
        # this language; pos is overwritten by process_pos.
        base_data = WordEntry(
            word=page_title,
            lang_code=lang_code,
            lang=lang_name,
            pos="ERROR_UNKNOWN_POS",
        )

        prev_data: WordEntry | None = None

        if len(sublevels) == 0 and ok:
            # Someone messed up by putting a Level 1 directly after a
            # language header; remember the language for the next loop
            # iteration.
            previous_empty_language_name = lang_name
            previous_empty_language_code = lang_code
            continue

        previous_empty_language_name = None
        previous_empty_language_code = None

        # Parse tables directly in the language level's content.
        # Ex. from https://el.wiktionary.org/wiki/αμάξι
        # =={{-el-}}==
        # {{el-κλίση-'τραγούδι'}} <= THIS
        # ...
        #
        # Notes:
        # * Only support Modern Greek pages at the moment.
        # * There can be more than one inflection: ρολόι, πλάγιος
        if (
            level.kind == NodeKind.LEVEL2
            and level.largs
            and clean_node(wxr, None, level.largs[0]) == "Νέα ελληνικά (el)"
        ):
            for child in level.children:
                if isinstance(
                    child, TemplateNode
                ) and child.template_name.startswith("el-κλίση"):
                    process_inflection_section(
                        wxr,
                        base_data,
                        child,
                        source="declension",
                    )

        for sublevel in sublevels:
            if len(sublevel.largs) == 0:
                wxr.wtp.debug(
                    f"Sublevel without .largs: {sublevel=}", sortid="page/92"
                )
                continue

            heading_title = (
                clean_node(wxr, None, sublevel.largs[0]).lower().strip("= \n")
            )

            heading_type, pos, tags, num, ok = parse_lower_heading(
                wxr, heading_title
            )

            # Headings may carry an explicit section number; keep the
            # highest one seen so far.
            section_num = max(section_num, num)

            if not ok:
                wxr.wtp.wiki_notice(
                    f"Sub-language heading '{heading_title}' couldn't "
                    f"be parsed as a heading; "
                    f"{heading_type=}, {heading_title=}, {tags=}.",
                    sortid="page/103/20241112",
                )
                continue

            if heading_type in (Heading.Err, Heading.Ignored):
                continue

            # POS sections found directly here or nested inside etymology
            # and pronunciation sections.
            found_pos_sections: POSReturns = []

            if heading_type is Heading.Etym:
                # Update base_data with etymology and maybe sound data.
                # Return any sublevels in the etymology section
                # so that we can check for POS sections.
                num, etym_sublevels = process_etym(
                    wxr, base_data, sublevel, heading_title, section_num
                )

                section_num = max(section_num, num)

                found_pos_sections.extend(etym_sublevels)

            # Typical pronunciation section that applies to the whole
            # entry
            if heading_type == Heading.Pron:
                # Update base_data with sound and hyphenation data.
                # Return any sublevels in the pronunciation section
                # so that we can check for POS sections.
                num, pron_sublevels = process_pron(
                    wxr, sublevel, base_data, heading_title, section_num
                )

                section_num = max(section_num, num)

                found_pos_sections.extend(pron_sublevels)

            if heading_type == Heading.POS:
                # SAFETY: Since the heading_type is POS, parse_lower_heading
                # "pos_or_section" is guaranteed to be a pos: POSName
                pos = cast(POSName, pos)
                found_pos_sections.append(
                    (
                        pos,
                        heading_title,
                        tags,
                        section_num,
                        sublevel,
                        base_data.model_copy(deep=True),
                    )
                )

            #################################################
            # Finally handle all POS sections we've extracted
            for (
                pos,
                title,
                tags,
                num,
                pos_section,
                pos_base_data,
            ) in found_pos_sections:
                if (
                    pos_ret := process_pos(
                        wxr,
                        pos_section,
                        pos_base_data.model_copy(deep=True),
                        prev_data,
                        pos,
                        title,
                        tags,
                        num,
                    )
                ) is not None:
                    word_datas.append(pos_ret)
                    prev_data = pos_ret
                else:
                    wxr.wtp.error(
                        f"Couldn't parse PoS section {pos}",
                        sortid="page.py/20250110",
                    )

    # Transform pydantic objects to normal dicts so that the old code can
    # handle them.
    return [wd.model_dump(exclude_defaults=True) for wd in word_datas]
# Matches regular language headings of the form "Language name (code)",
# e.g. "αγγλικά (en)": group 1 is the (Greek) language name, group 2 the
# language code. \w is Unicode-aware, so Greek letters match.
LANGUAGE_HEADINGS_RE = re.compile(r"([\w\s]+)\(([-\w]+)\)")

# Headings that cannot be resolved through LANGUAGE_HEADINGS_RE plus the
# normal name/code lookup; keys are the lowercased heading text. "name" is
# optional — when absent, the English name is derived from "code".
IRREGULAR_LANGUAGE_HEADINGS: dict[str, dict[str, str]] = {
    "διαγλωσσικοί όροι": {"name": "Translingual", "code": "mul"},
    "διεθνείς όροι": {"name": "Translingual", "code": "mul"},
    "νέα ελληνικά (el)": {"code": "el"},
    "μεσαιωνικά ελληνικά (gkm)": {"name": "Medieval Greek", "code": "gkm"},
    "μεσαιωνικά ελληνικά": {"name": "Medieval Greek", "code": "gkm"},
    "αρωμουνικά (βλάχικα) (roa-rup)": {"code": "roa-rup"},
    "κρητικά (el-crt)": {"code": "el-crt", "name": "Cretan Greek"},
    "κυπριακά (el-cyp)": {"code": "el-cyp", "name": "Cypriot Greek"},
    "χαρακτήρας unicode": {"code": "mul", "name": "Translingual"},
    # "": {"code": ""},
}
def parse_language_name(
    wxr: WiktextractContext, lang_heading: str
) -> tuple[str, str, bool]:
    """Resolve a language heading to (English name, language code, success).

    Resolution order:
    1. IRREGULAR_LANGUAGE_HEADINGS for known-odd headings;
    2. a plain Greek language name looked up via name_to_code;
    3. a "Name (code)" heading matched by LANGUAGE_HEADINGS_RE.

    On failure returns (original heading, "", False).
    """
    lang_heading = lang_heading.strip()
    irregulars = IRREGULAR_LANGUAGE_HEADINGS.get(lang_heading.lower())
    if irregulars is not None:
        return (
            irregulars.get("name") or code_to_name(irregulars["code"], "en"),
            irregulars["code"],
            True,
        )

    m = LANGUAGE_HEADINGS_RE.match(lang_heading)
    if m is None:
        # No "(code)" suffix; try to resolve the heading as a Greek
        # language name.
        lang_code = name_to_code(lang_heading, "el")
        if not lang_code:
            return lang_heading, "", False
        english_lang_name = code_to_name(lang_code, "en")
        if not english_lang_name:
            wxr.wtp.warning(
                f"Invalid lang_code '{lang_code}'", sortid="page/194"
            )
            return lang_heading, "", False
        return english_lang_name, lang_code, True

    matched_name = m.group(1).lower().strip()
    lang_code = m.group(2)
    greek_lang_name = code_to_name(lang_code, "el")
    english_lang_name = code_to_name(lang_code, "en")
    if not english_lang_name:
        wxr.wtp.warning(
            f"Invalid lang_code '{lang_code}'", sortid="page/43a"
        )
        return lang_heading, "", False
    # Sanity check: the canonical Greek name for the code should match the
    # name in the heading (ignoring accents); if not, log it but still use
    # the English name derived from the code.
    if strip_accents(greek_lang_name).lower() != strip_accents(matched_name):
        wxr.wtp.debug(
            f"Language code '{lang_code}' "
            f"Greek name '{greek_lang_name}' does not match "
            f"original string '{lang_heading}'; "
            f"outputting {english_lang_name}",
            sortid="page/45",
        )
    return english_lang_name, lang_code, True