Coverage for src/wiktextract/extractor/el/etymology.py: 70%

37 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1from typing import cast 

2 

3from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

4from wikitextprocessor.parser import LEVEL_KIND_FLAGS 

5 

6from wiktextract import WiktextractContext 

7from wiktextract.page import clean_node 

8 

9from .models import WordEntry 

10from .parse_utils import ( 

11 POSReturns, 

12 find_sections, 

13) 

14from .pos import extract_form_of_templates 

15from .pronunciation import process_pron 

16from .section_titles import Heading, POSName 

17 

18 

def process_etym(
    wxr: WiktextractContext,
    base_data: WordEntry,
    node: WikiNode,
    title: str,
    num: int,
) -> tuple[int, POSReturns]:
    """Extract etymological data from section and process POS children.

    The direct children of ``node`` are split into two groups: everything
    that is not a heading-level node (treated as the etymology body) and the
    heading-level nodes (treated as sub-sections, handed to
    ``find_sections``).

    Args:
        wxr: Extraction context; ``wxr.wtp.start_subsection`` is called with
            ``title`` before any processing.
        base_data: Entry that receives the extracted data. Its
            ``etymology_text`` is set when the cleaned body is non-empty, and
            a deep copy of it is attached to every POS sub-section returned.
        node: The etymology section node.
        title: Heading title of the etymology section.
        num: Section number the caller has reached so far.

    Returns:
        A ``(section_num, pos_returns)`` tuple. ``section_num`` is the
        largest number among ``num``, the numbers yielded by
        ``find_sections`` for POS and pronunciation headings, and the numbers
        returned by ``process_pron``. ``pos_returns`` holds one tuple per POS
        heading found directly below this section, followed (in document
        order) by whatever ``process_pron`` returned for pronunciation
        headings.
    """
    # Get everything except subsections, which we assume are POS nodes.
    etym_contents = list(node.invert_find_child(LEVEL_KIND_FLAGS))
    etym_sublevels = list(node.find_child(LEVEL_KIND_FLAGS))
    ret_etym_sublevels: POSReturns = []

    wxr.wtp.start_subsection(title)

    section_num = num

    # Extract form_of data from templates that sit directly in the section
    # body or inside (possibly nested) list items. The parent list and the
    # index are passed along so the helper can inspect neighbouring nodes.
    for i, t_node in enumerate(etym_contents):
        if isinstance(t_node, TemplateNode):
            extract_form_of_templates(wxr, base_data, t_node, etym_contents, i)
        if isinstance(t_node, WikiNode) and t_node.kind == NodeKind.LIST:
            for l_item in t_node.find_child_recursively(NodeKind.LIST_ITEM):
                for j, l_node in enumerate(l_item.children):
                    if isinstance(l_node, TemplateNode):
                        extract_form_of_templates(
                            wxr, base_data, l_node, l_item.children, j
                        )

    # Greek wiktionary doesn't seem to have etymology templates, or at
    # least they're not used as much.
    # Leading ":" / "#" characters are list/indent markup left over at the
    # start of the cleaned text.
    etym_text = clean_node(wxr, base_data, etym_contents).lstrip(":#").strip()

    if etym_text:
        base_data.etymology_text = etym_text

    # Loop targets are deliberately named differently from the `title` and
    # `num` parameters so the arguments are not silently overwritten.
    for heading_type, pos, sub_title, tags, sub_num, subnode in find_sections(
        wxr, etym_sublevels
    ):
        if heading_type == Heading.POS:
            section_num = max(section_num, sub_num)
            # SAFETY: Since the heading_type is POS, find_sections
            # "pos_or_section" is guaranteed to be a pos: POSName
            pos = cast(POSName, pos)
            ret_etym_sublevels.append(
                (
                    pos,
                    sub_title,
                    tags,
                    sub_num,
                    subnode,
                    # Snapshot: later mutations of base_data must not leak
                    # into this POS entry.
                    base_data.model_copy(deep=True),
                )
            )
        elif heading_type == Heading.Pron:
            section_num = max(section_num, sub_num)

            pron_num, pron_sublevels = process_pron(
                wxr, subnode, base_data, sub_title, section_num
            )
            # Fold the number reported by process_pron back into the running
            # maximum; previously this value was assigned and then dropped.
            # NOTE(review): assumes process_pron returns the highest section
            # number it reached, mirroring this function -- confirm.
            section_num = max(section_num, pron_num)

            ret_etym_sublevels.extend(pron_sublevels)

    return section_num, ret_etym_sublevels