Coverage for src/wiktextract/extractor/el/page.py: 63% (104 statements)
import re
from typing import Any

from mediawiki_langcodes import code_to_name, name_to_code

# NodeKind is an internal enum for WikiNode and subclasses that specifies
# what kind of WikiNode it is. Subclasses also have the field, but it's
# always NodeKind.TEMPLATE for TemplateNodes etc.
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, NodeKind, WikiNode

# clean_node() takes a WikiNode-plus-strings node or tree and gives you
# cleanish text.
from wiktextract.page import clean_node, clean_value

# The main context object to more easily share parsing state between
# functions. Contains WiktextractContext.wtp, which is the context for
# wikitextprocessor and usually holds all the good stuff.
from wiktextract.wxr_context import WiktextractContext

# For debug printing when doing batches, and for log messages that don't
# make sense as word-specific debug, warning or error messages (see those
# in wikitextprocessor's context).
from wiktextract.wxr_logging import logger

from .etymology import process_etym
from .models import WordEntry
from .parse_utils import (
    POSReturns,
    find_sections,
    parse_lower_heading,
    strip_accents,
)
from .pos import process_pos
from .pronunciation import process_pron
from .section_titles import (
    Heading,
    Tags,
)

# from .text_utils import ENDING_NUMBER_RE
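
# A minimal usage sketch for the imports above (hypothetical nodes, kept in
# comment form so nothing runs at import time): NodeKind tells you what a
# node is, and clean_node() flattens a node or list of nodes to plain text.
#   root = wxr.wtp.parse("== Αγγλικά ==")
#   level = next(root.find_child(LEVEL_KIND_FLAGS))
#   level.kind                          # NodeKind.LEVEL2
#   clean_node(wxr, None, level.largs)  # "Αγγλικά"
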
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse Greek Wiktionary (el.wiktionary.org) page.

    References:
    * https://el.wiktionary.org/wiki/Βικιλεξικό:Δομή_λημμάτων
    """

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    parts = []
    parts.append(page_title)

    # from .debug_bypass import debug_bypass
    # return debug_bypass(wxr, page_title, page_text)

    if page_title.startswith("Πύλη:"):
        return []

    page_root = wxr.wtp.parse(page_text)

    # print_tree(page_root)  # WikiNode tree pretty-printer
    word_datas: list[WordEntry] = []

    # stuff_outside_main_headings = page_root.invert_find_child(
    #     LEVEL_KIND_FLAGS)

    # Handle stuff at the very top of the page
    # for thing_node in stuff_outside_main_headings:
    #     ...

    previous_empty_language_name: str | None = None
    previous_empty_language_code: str | None = None

    for level in page_root.find_child(LEVEL_KIND_FLAGS):
        # Contents of the heading itself; should be "Languagename".
        # clean_node() is the general-purpose WikiNode/string -> string
        # implementation. Things like formatting are stripped; it mimics
        # the output of wikitext when possible.
        # == English ==   # <- This part
        # === Noun ===
        lang_name, lang_code, ok = parse_language_name(
            wxr, clean_node(wxr, None, level.largs).strip()
        )

        section_num = -1

        # print("=====")
        # print(f"{level=}\n => {clean_node(wxr, None, level.largs).strip()}")

        sublevels = list(level.find_child(LEVEL_KIND_FLAGS))

        if not ok:
            if level.kind not in (NodeKind.LEVEL1, NodeKind.LEVEL2):
                # We tried to parse a lower level as a language because it
                # was a direct child of root and failed, so let's just
                # ignore it and not print a warning.
                continue
            if (
                previous_empty_language_name is None
                or previous_empty_language_code is None
            ):
                wxr.wtp.warning(
                    f"Can't parse language header: '{lang_name}'; "
                    "skipping section",
                    sortid="page/111",
                )
                continue
            lang_name = previous_empty_language_name
            lang_code = previous_empty_language_code
            sublevels = [level]

        wxr.wtp.start_section(lang_name)

        base_data = WordEntry(
            word=page_title,
            lang_code=lang_code,
            lang=lang_name,
            pos="ERROR_UNKNOWN_POS",
        )

        prev_data: WordEntry | None = None

        if len(sublevels) == 0 and ok:
            # Someone messed up by putting a Level 1 directly after a
            # language header; remember the language so the next top-level
            # section can be attached to it (see the `not ok` branch above).
            previous_empty_language_name = lang_name
            previous_empty_language_code = lang_code
            continue

        previous_empty_language_name = None
        previous_empty_language_code = None

        # XXX Some tables are put directly into the language level's content.
        # Separate content and sublevels, parse content and put it in
        # base_data.

        for sublevel in sublevels:
            if len(sublevel.largs) == 0:
                wxr.wtp.debug(
                    f"Sublevel without .largs: {sublevel=}", sortid="page/92"
                )
                continue

            heading_title = (
                clean_node(wxr, None, sublevel.largs[0]).lower().strip("= \n")
            )

            type, pos, heading_name, tags, num, ok = parse_lower_heading(
                wxr, heading_title
            )

            section_num = num if num > section_num else section_num

            if not ok:
                wxr.wtp.warning(
                    f"Sub-language heading '{heading_title}' couldn't be "
                    f"parsed as a heading; "
                    f"{type=}, {heading_name=}, {tags=}.",
                    sortid="page/103/20241112",
                )
                continue

            if type in (Heading.Err, Heading.Ignored):
                continue
            ## TEMP

            found_pos_sections: POSReturns = []

            if type is Heading.Etym:
                # Update base_data with etymology and maybe sound data.
                # Return any sublevels in the etymology section
                # so that we can check for POS sections.
                num, etym_sublevels = process_etym(
                    wxr, base_data, sublevel, heading_name, section_num
                )

                section_num = num if num > section_num else section_num

                found_pos_sections.extend(etym_sublevels)

            # ...
            # text = clean_node(wxr, None, sublevel)
            # text = wxr.wtp.node_to_wikitext(sublevel)
            # if "\n=" in text:
            #     text = "£ " + "\n£ ".join(text.splitlines())
            # logger.warning(f"£ {wxr.wtp.title}\n" + text)

            # PRINTS HERE

            # continue

            ## /TEMP

            # Typical pronunciation section that applies to the whole
            # entry
            if type == Heading.Pron:
                # Update base_data with sound and hyphenation data.
                # Return any sublevels in the pronunciation section
                # so that we can check for POS sections.
                num, pron_sublevels = process_pron(
                    wxr, sublevel, base_data, heading_name, section_num
                )

                section_num = num if num > section_num else section_num

                found_pos_sections.extend(pron_sublevels)

            if type is Heading.POS:
                found_pos_sections.append(
                    (
                        pos,
                        heading_name,
                        tags,
                        section_num,
                        sublevel,
                        base_data.model_copy(deep=True),
                    )
                )

            #################################################
            # Finally handle all POS sections we've extracted
            for (
                pos,
                title,
                tags,
                num,
                pos_section,
                pos_base_data,
            ) in found_pos_sections:
                if (
                    pos_ret := process_pos(
                        wxr,
                        pos_section,
                        pos_base_data.model_copy(deep=True),
                        prev_data,
                        pos,  # the English PoS name
                        title,
                        tags,
                        num,
                    )
                ) is not None:
                    word_datas.append(pos_ret)
                    prev_data = pos_ret
                else:
                    wxr.wtp.error(
                        f"Couldn't parse PoS section {pos}",
                        sortid="page.py/20250110",
                    )

    # logger.info("%%" + "\n%%".join(parts))
    # Transform pydantic objects into normal dicts so that the old code
    # can handle them.
    return [wd.model_dump(exclude_defaults=True) for wd in word_datas]
    # return [base_data.model_dump(exclude_defaults=True)]
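
# A hedged sketch of the shape parse_page() returns, based only on the
# WordEntry fields constructed above (real entries carry more model fields,
# and the example values here are assumptions):
#   parse_page(wxr, "λέξη", page_text)
#   # -> [{"word": "λέξη", "lang": "Greek", "lang_code": "el",
#   #      "pos": "noun", ...}]
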
LANGUAGE_HEADINGS_RE = re.compile(r"([\w\s]+)\(([-\w]+)\)")
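# For example, a regular heading like "αγγλικά (en)" matches with
# group(1) == "αγγλικά " and group(2) == "en"; headings that don't fit the
# "name (code)" shape fall through to the name_to_code() lookup in
# parse_language_name() below.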

IRREGULAR_LANGUAGE_HEADINGS = {
    "διαγλωσσικοί όροι": {"name": "Translingual", "code": "mul"},
    "διεθνείς όροι": {"name": "Translingual", "code": "mul"},
    "νέα ελληνικά (el)": {"code": "el"},
    "μεσαιωνικά ελληνικά (gkm)": {"name": "Medieval Greek", "code": "gkm"},
    "μεσαιωνικά ελληνικά": {"name": "Medieval Greek", "code": "gkm"},
    "αρωμουνικά (βλάχικα) (roa-rup)": {"code": "roa-rup"},
    "κρητικά (el-crt)": {"code": "el-crt", "name": "Cretan Greek"},
    "κυπριακά (el-cyp)": {"code": "el-cyp", "name": "Cypriot Greek"},
    "χαρακτήρας unicode": {"code": "mul", "name": "Translingual"},
    # "": {"code": ""},
}


def parse_language_name(
    wxr: WiktextractContext, lang_heading: str
) -> tuple[str, str, bool]:
    lang_heading = lang_heading.strip()
    irregulars = IRREGULAR_LANGUAGE_HEADINGS.get(lang_heading.lower(), None)
    if irregulars is not None:
        return (
            irregulars.get("name") or code_to_name(irregulars["code"], "en"),
            irregulars["code"],
            True,
        )

    m = LANGUAGE_HEADINGS_RE.match(lang_heading)
    if m is None:
        lang_code = name_to_code(lang_heading, "el")
        if not lang_code:
            return lang_heading, "", False
        english_lang_name = code_to_name(lang_code, "en")
        if not english_lang_name:
            wxr.wtp.warning(
                f"Invalid lang_code '{lang_code}'", sortid="page/194"
            )
            return lang_heading, "", False
        return english_lang_name, lang_code, True
    else:
        matched_name = m.group(1).lower().strip()
        lang_code = m.group(2)
        greek_lang_name = code_to_name(lang_code, "el")
        english_lang_name = code_to_name(lang_code, "en")
        if not english_lang_name:
            wxr.wtp.warning(
                f"Invalid lang_code '{lang_code}'", sortid="page/43a"
            )
            return lang_heading, "", False
        if strip_accents(greek_lang_name).lower() != strip_accents(
            matched_name
        ):
            wxr.wtp.debug(
                f"Greek name '{greek_lang_name}' for language code "
                f"'{lang_code}' does not match the original heading "
                f"'{lang_heading}'; outputting '{english_lang_name}'",
                sortid="page/45",
            )
        return english_lang_name, lang_code, True
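
# A rough doctest-style sketch of the three paths above (exact names come
# from the mediawiki_langcodes tables, so treat the outputs as assumptions):
#   parse_language_name(wxr, "αγγλικά (en)")    # -> ("English", "en", True)
#   parse_language_name(wxr, "διεθνείς όροι")   # -> ("Translingual", "mul", True)
#   parse_language_name(wxr, "not-a-language")  # -> ("not-a-language", "", False)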