Coverage for src/wiktextract/extractor/el/etymology.py: 72% (37 statements)
from wikitextprocessor import NodeKind, TemplateNode, WikiNode
from wikitextprocessor.core import TemplateArgs
from wikitextprocessor.parser import LEVEL_KIND_FLAGS

from wiktextract import WiktextractContext
from wiktextract.page import clean_node

from .models import WordEntry
from .parse_utils import (
    POSReturns,
    find_sections,
)
from .pos import extract_form_of_templates
from .pronunciation import process_pron
from .section_titles import POS_HEADINGS, Heading, Tags
from .text_utils import ENDING_NUMBER_RE
def process_etym(
    wxr: WiktextractContext,
    base_data: WordEntry,
    node: WikiNode,
    title: str,
    num: int,
) -> tuple[int, POSReturns]:
    """Extract etymological data from section and process POS children."""
    # Get everything except subsections, which we assume are POS nodes.
    etym_contents = list(node.invert_find_child(LEVEL_KIND_FLAGS))
    etym_sublevels = list(node.find_child(LEVEL_KIND_FLAGS))
    ret_etym_sublevels: POSReturns = []

    wxr.wtp.start_subsection(title)

    section_num = num

    # Extract form_of data
    for i, t_node in enumerate(etym_contents):
        if isinstance(t_node, TemplateNode):  # coverage: 38 ↛ 39, condition never true
            extract_form_of_templates(wxr, base_data, t_node, etym_contents, i)
        if isinstance(t_node, WikiNode) and t_node.kind == NodeKind.LIST:  # coverage: 40 ↛ 37, condition always true
            for l_item in t_node.find_child_recursively(NodeKind.LIST_ITEM):
                for j, l_node in enumerate(l_item.children):
                    if isinstance(l_node, TemplateNode):
                        extract_form_of_templates(
                            wxr, base_data, l_node, l_item.children, j
                        )

    # Greek wiktionary doesn't seem to have etymology templates, or at
    # least they're not used as much.
    etym_text = (
        clean_node(wxr, base_data, etym_contents)
        .lstrip(":#")
        .strip()
    )

    if etym_text:  # coverage: 56 ↛ 59, condition always true
        base_data.etymology_text = etym_text

    for heading_type, pos, title, tags, num, subnode in find_sections(  # coverage: 59 ↛ 62, loop never started
        wxr, etym_sublevels
    ):
        if heading_type == Heading.POS:
            section_num = num if num > section_num else section_num
            ret_etym_sublevels.append(
                (pos, title, tags, num, subnode, base_data.copy(deep=True))
            )
        elif heading_type == Heading.Pron:
            section_num = num if num > section_num else section_num

            num, pron_sublevels = process_pron(
                wxr, subnode, base_data, title, section_num
            )

            ret_etym_sublevels.extend(pron_sublevels)

    return section_num, ret_etym_sublevels
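A minimal sketch of how a caller might consume the return value follows. Only process_etym's signature and the six-element tuples built in ret_etym_sublevels come from the code above; the helper name handle_etym_sections and the (title, node) pair layout are assumptions for illustration, not part of this module or of wiktextract's actual page handling.

# Hypothetical caller sketch; handle_etym_sections is not part of this module.
from wiktextract.extractor.el.etymology import process_etym


def handle_etym_sections(wxr, base_data, etym_sections, start_num=0):
    """Run process_etym over assumed (title, WikiNode) pairs and collect POS entries."""
    section_num = start_num
    entries = []
    for title, node in etym_sections:
        section_num, pos_returns = process_etym(
            wxr, base_data, node, title, section_num
        )
        # Each tuple mirrors the append() in process_etym; the last element is
        # a deep copy of base_data that carries any etymology_text set above.
        for pos, pos_title, tags, num, subnode, data in pos_returns:
            entries.append((pos, data))
    return section_num, entries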