Coverage report for src/wiktextract/extractor/el/page.py: 67% of 109 statements covered (coverage.py v7.13.0, created at 2025-12-12 08:09 +0000).
1import re
2from typing import Any, cast
4from mediawiki_langcodes import code_to_name, name_to_code
6# NodeKind is an internal enum for WikiNode and subclasses that specifies
7# what kind of WikiNode it is. Subclasses also have the field, but it's
8# always NodeKind.TEMPLATE for TemplateNodes etc.
9from wikitextprocessor import TemplateNode
10from wikitextprocessor.parser import LEVEL_KIND_FLAGS, NodeKind
12# Clean node takes a WikiNode+strings node or tree and gives you a cleanish text
13from wiktextract.extractor.el.table import process_inflection_section
14from wiktextract.page import clean_node
16# The main context object to more easily share state of parsing between
17# functions. Contains WiktextractContext.wtp, which is the context for
18# wikitextprocessor and usually holds all the good stuff.
19from wiktextract.wxr_context import WiktextractContext
21# For debug printing when doing batches and log messages that don't make
22# sense as word-specific debug, warning or error messages (see those
23# in wikitextprocessor's context).
24from wiktextract.wxr_logging import logger
26from .etymology import process_etym
27from .models import WordEntry
28from .parse_utils import (
29 POSReturns,
30 parse_lower_heading,
31 strip_accents,
32)
33from .pos import process_pos
34from .pronunciation import process_pron
35from .section_titles import Heading, POSName
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a Greek Wiktionary (el.wiktionary.org) page.

    Walks each top-level (language) heading, collects POS sections (also
    those nested under etymology/pronunciation headings) and extracts one
    WordEntry per POS section. Entries are returned as plain dicts so the
    older downstream code can consume them.

    References:
    * https://el.wiktionary.org/wiki/Βικιλεξικό:Δομή_λημμάτων
    """
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # from .debug_bypass import debug_bypass
    # return debug_bypass(wxr, page_title, page_text)

    # Portal pages ("Πύλη:") are not dictionary entries; skip them.
    if page_title.startswith("Πύλη:"):
        return []

    page_root = wxr.wtp.parse(page_text)

    word_data: list[WordEntry] = []

    # If a language section turns out to be empty (its sub-sections were
    # mistakenly placed as *siblings* of the language heading instead of
    # children), remember the language so the stray sibling level that
    # follows can still be attributed to it.
    previous_empty_language_name: str | None = None
    previous_empty_language_code: str | None = None

    for level in page_root.find_child(LEVEL_KIND_FLAGS):
        # Contents of the heading itself; should be "Languagename".
        # clean_node() is the general purpose WikiNode/string -> string
        # implementation. Things like formatting are stripped; it mimics
        # the output of wikitext when possible.
        # == English == # <- This part
        # === Noun ===
        lang_name, lang_code, ok = parse_language_name(
            wxr, clean_node(wxr, None, level.largs).strip()
        )

        section_num = -1

        sublevels = list(level.find_child(LEVEL_KIND_FLAGS))

        if not ok:
            if level.kind not in (NodeKind.LEVEL1, NodeKind.LEVEL2):
                # We tried to parse a lower level as a language because it
                # was a direct child of root and failed, so let's just ignore
                # it and not print a warning.
                continue
            if (
                previous_empty_language_name is None
                or previous_empty_language_code is None
            ):
                wxr.wtp.wiki_notice(
                    f"Can't parse language header: '{lang_name}'; "
                    "skipping section",
                    sortid="page/111",
                )
                continue
            # Attribute this stray level to the preceding empty language
            # section and process it as that language's only sublevel.
            lang_name = previous_empty_language_name
            lang_code = previous_empty_language_code
            sublevels = [level]

        wxr.wtp.start_section(lang_name)

        # Template entry that each POS section copies and fills in; the
        # placeholder pos is overwritten (or flags a bug if it survives).
        base_data = WordEntry(
            word=page_title,
            lang_code=lang_code,
            lang=lang_name,
            pos="ERROR_UNKNOWN_POS",
        )

        prev_data: WordEntry | None = None

        if len(sublevels) == 0 and ok:
            # Someone messed up by putting a Level 1 directly after a
            # language header; remember this language for the next level.
            previous_empty_language_name = lang_name
            previous_empty_language_code = lang_code
            continue

        previous_empty_language_name = None
        previous_empty_language_code = None

        # Parse tables directly into the language level's content.
        # Ex. from https://el.wiktionary.org/wiki/αμάξι
        # =={{-el-}}==
        # {{el-κλίση-'τραγούδι'}} <= THIS
        # ...
        #
        # Notes:
        # * Only support Modern Greek pages at the moment.
        # * There can be more than one inflection: ρολόι, πλάγιος
        if (
            level.kind == NodeKind.LEVEL2
            and level.largs
            and clean_node(wxr, None, level.largs[0]) == "Νέα ελληνικά (el)"
        ):
            for child in level.children:
                if isinstance(
                    child, TemplateNode
                ) and child.template_name.startswith("el-κλίση"):
                    process_inflection_section(
                        wxr,
                        base_data,
                        child,
                        source="declension",
                    )

        for sublevel in sublevels:
            if len(sublevel.largs) == 0:
                wxr.wtp.debug(
                    f"Sublevel without .largs: {sublevel=}", sortid="page/92"
                )
                continue

            heading_title = (
                clean_node(wxr, None, sublevel.largs[0]).lower().strip("= \n")
            )

            heading_type, pos, tags, num, ok = parse_lower_heading(
                wxr, heading_title
            )

            section_num = max(section_num, num)

            if not ok:
                # Fixed message: the original duplicated the word "be".
                wxr.wtp.wiki_notice(
                    f"Sub-language heading '{heading_title}' couldn't "
                    f"be parsed as a heading; "
                    f"{heading_type=}, {heading_title=}, {tags=}.",
                    sortid="page/103/20241112",
                )
                continue

            if heading_type in (Heading.Err, Heading.Ignored):
                continue

            # POS sections found directly or nested under etym/pron levels.
            found_pos_sections: POSReturns = []

            if heading_type == Heading.Etym:
                # Update base_data with etymology and maybe sound data.
                # Return any sublevels in the etymology section
                # so that we can check for POS sections.
                num, etym_sublevels = process_etym(
                    wxr, base_data, sublevel, heading_title, section_num
                )
                section_num = max(section_num, num)
                found_pos_sections.extend(etym_sublevels)

            # Typical pronunciation section that applies to the whole
            # entry
            if heading_type == Heading.Pron:
                # Update base_data with sound and hyphenation data.
                # Return any sublevels in the pronunciation section
                # so that we can check for POS sections.
                num, pron_sublevels = process_pron(
                    wxr, sublevel, base_data, heading_title, section_num
                )
                section_num = max(section_num, num)
                found_pos_sections.extend(pron_sublevels)

            if heading_type == Heading.POS:
                # SAFETY: Since the heading_type is POS, parse_lower_heading
                # "pos_or_section" is guaranteed to be a pos: POSName
                pos = cast(POSName, pos)
                found_pos_sections.append(
                    (
                        pos,
                        heading_title,
                        tags,
                        section_num,
                        sublevel,
                        base_data.model_copy(deep=True),
                    )
                )

            # Finally handle all POS sections we've extracted.
            for (
                pos,
                title,
                tags,
                num,
                pos_section,
                pos_base_data,
            ) in found_pos_sections:
                if (
                    pos_ret := process_pos(
                        wxr,
                        pos_section,
                        pos_base_data.model_copy(deep=True),
                        prev_data,
                        pos,
                        title,
                        tags,
                        num,
                    )
                ) is not None:
                    word_data.append(pos_ret)
                    prev_data = pos_ret
                else:
                    wxr.wtp.error(
                        f"Couldn't parse PoS section {pos}",
                        sortid="page.py/20250110",
                    )

    # Transform pydantic objects to normal dicts so that the old code can
    # handle them.
    return [wd.model_dump(exclude_defaults=True) for wd in word_data]
# Matches a language heading of the form "Name (code)"; group 1 captures
# the (possibly multi-word) name, group 2 captures the language code.
LANGUAGE_HEADINGS_RE = re.compile(r"([\w\s]+)\(([-\w]+)\)")

# Headings that can't be resolved through the regular "Name (code)"
# pattern, keyed by lowercased heading text. Entries without a "name"
# fall back to the English name looked up from the code.
IRREGULAR_LANGUAGE_HEADINGS = {
    # Translingual / international terms
    "διαγλωσσικοί όροι": {"name": "Translingual", "code": "mul"},
    "διεθνείς όροι": {"name": "Translingual", "code": "mul"},
    "χαρακτήρας unicode": {"code": "mul", "name": "Translingual"},
    # Greek variants and historical stages
    "νέα ελληνικά (el)": {"code": "el"},
    "μεσαιωνικά ελληνικά (gkm)": {"name": "Medieval Greek", "code": "gkm"},
    "μεσαιωνικά ελληνικά": {"name": "Medieval Greek", "code": "gkm"},
    "κρητικά (el-crt)": {"code": "el-crt", "name": "Cretan Greek"},
    "κυπριακά (el-cyp)": {"code": "el-cyp", "name": "Cypriot Greek"},
    # Other
    "αρωμουνικά (βλάχικα) (roa-rup)": {"code": "roa-rup"},
}
def parse_language_name(
    wxr: WiktextractContext, lang_heading: str
) -> tuple[str, str, bool]:
    """Resolve a language section heading into (name, code, ok).

    `lang_heading` is the cleaned text of a top-level heading, normally
    of the form "Greekname (code)". On success returns the English
    language name, the language code, and True; on failure returns the
    original heading, "", and False.
    """
    lang_heading = lang_heading.strip()

    # Manually mapped headings that don't follow the "Name (code)" pattern.
    irregulars = IRREGULAR_LANGUAGE_HEADINGS.get(lang_heading.lower(), None)
    if irregulars is not None:
        return (
            irregulars.get("name") or code_to_name(irregulars["code"], "en"),
            irregulars["code"],
            True,
        )

    m = LANGUAGE_HEADINGS_RE.match(lang_heading)
    if m is None:
        # No "(code)" part; try to resolve the Greek name directly.
        lang_code = name_to_code(lang_heading, "el")
        if not lang_code:
            return lang_heading, "", False
        english_lang_name = code_to_name(lang_code, "en")
        if not english_lang_name:
            wxr.wtp.warning(
                f"Invalid lang_code '{lang_code}'", sortid="page/194"
            )
            return lang_heading, "", False
        return english_lang_name, lang_code, True

    matched_name = m.group(1).lower().strip()
    lang_code = m.group(2)
    greek_lang_name = code_to_name(lang_code, "el")
    english_lang_name = code_to_name(lang_code, "en")
    if not english_lang_name:
        wxr.wtp.warning(
            f"Invalid lang_code '{lang_code}'", sortid="page/43a"
        )
        return lang_heading, "", False
    # Sanity check: the heading's name part should agree with the Greek
    # name the code maps to (accent-insensitive comparison). A mismatch
    # is only logged; the code's English name is still returned.
    if strip_accents(greek_lang_name).lower() != strip_accents(matched_name):
        wxr.wtp.debug(
            f"Language code '{lang_code}' "
            f"Greek name '{greek_lang_name}' does not match "
            f"original string '{lang_heading}'; "
            f"outputting {english_lang_name}",
            sortid="page/45",
        )
    return english_lang_name, lang_code, True