Coverage for src/wiktextract/extractor/el/etymology.py: 72% (37 statements)
from wikitextprocessor import NodeKind, TemplateNode, WikiNode
from wikitextprocessor.core import TemplateArgs
from wikitextprocessor.parser import LEVEL_KIND_FLAGS

from wiktextract import WiktextractContext
from wiktextract.page import clean_node

from .models import WordEntry
from .parse_utils import (
    POSReturns,
    find_sections,
)
from .pos import extract_form_of_templates
from .pronunciation import process_pron
from .section_titles import POS_HEADINGS, Heading, Tags
from .text_utils import ENDING_NUMBER_RE
def process_etym(
    wxr: WiktextractContext,
    base_data: WordEntry,
    node: WikiNode,
    title: str,
    num: int,
) -> tuple[int, POSReturns]:
    """Extract etymological data from section and process POS children."""
    # Get everything except subsections, which we assume are POS nodes.
    etym_contents = list(node.invert_find_child(LEVEL_KIND_FLAGS))
    etym_sublevels = list(node.find_child(LEVEL_KIND_FLAGS))
    ret_etym_sublevels: POSReturns = []

    wxr.wtp.start_subsection(title)

    section_num = num

    # Extract form_of data
    for i, t_node in enumerate(etym_contents):
        if isinstance(t_node, TemplateNode):  # coverage: 38 ↛ 39, condition never true
            extract_form_of_templates(wxr, base_data, t_node, etym_contents, i)
        if isinstance(t_node, WikiNode) and t_node.kind == NodeKind.LIST:  # coverage: 40 ↛ 37, condition always true
            for l_item in t_node.find_child_recursively(NodeKind.LIST_ITEM):
                for j, l_node in enumerate(l_item.children):
                    if isinstance(l_node, TemplateNode):
                        extract_form_of_templates(
                            wxr, base_data, l_node, l_item.children, j
                        )

    # Greek wiktionary doesn't seem to have etymology templates, or at
    # least they're not used as much.
    etym_text = (
        clean_node(wxr, base_data, etym_contents)
        .lstrip(":#")
        .strip()
    )

    if etym_text:  # coverage: 56 ↛ 59, condition always true
        base_data.etymology_text = etym_text

    for heading_type, pos, title, tags, num, subnode in find_sections(  # coverage: 59 ↛ 62, loop never started
        wxr, etym_sublevels
    ):
        if heading_type == Heading.POS:
            section_num = num if num > section_num else section_num
            ret_etym_sublevels.append(
                (pos, title, tags, num, subnode, base_data.copy(deep=True))
            )
        elif heading_type == Heading.Pron:
            section_num = num if num > section_num else section_num

            num, pron_sublevels = process_pron(
                wxr, subnode, base_data, title, section_num
            )

            ret_etym_sublevels.extend(pron_sublevels)

    return section_num, ret_etym_sublevels
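A minimal sketch of how a caller might consume the return value follows. Only process_etym's signature and the six-element tuples built in ret_etym_sublevels come from the code above; the helper name handle_etym_sections and the (title, node) pair layout are assumptions for illustration, not part of this module or of wiktextract's actual page handling.

# Hypothetical caller sketch; handle_etym_sections is not part of this module.
from wiktextract.extractor.el.etymology import process_etym


def handle_etym_sections(wxr, base_data, etym_sections, start_num=0):
    """Run process_etym over assumed (title, WikiNode) pairs and collect POS entries."""
    section_num = start_num
    entries = []
    for title, node in etym_sections:
        section_num, pos_returns = process_etym(
            wxr, base_data, node, title, section_num
        )
        # Each tuple mirrors the append() in process_etym; the last element is
        # a deep copy of base_data that carries any etymology_text set above.
        for pos, pos_title, tags, num, subnode, data in pos_returns:
            entries.append((pos, data))
    return section_num, entries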