Coverage for src/wiktextract/extractor/el/etymology.py: 72%

37 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

2from wikitextprocessor.core import TemplateArgs 

3from wikitextprocessor.parser import LEVEL_KIND_FLAGS 

4 

5from wiktextract import WiktextractContext 

6from wiktextract.page import clean_node 

7 

8from .models import WordEntry 

9from .parse_utils import ( 

10 POSReturns, 

11 find_sections, 

12) 

13from .pos import extract_form_of_templates 

14from .pronunciation import process_pron 

15from .section_titles import POS_HEADINGS, Heading, Tags 

16from .text_utils import ENDING_NUMBER_RE 

17 

18 

def process_etym(
    wxr: WiktextractContext,
    base_data: WordEntry,
    node: WikiNode,
    title: str,
    num: int,
) -> tuple[int, POSReturns]:
    """Collect etymology data from a section and gather its sub-sections.

    The section's direct children are split in two groups using
    ``LEVEL_KIND_FLAGS``: the non-heading content, which is treated as the
    etymology body, and the heading-level children, which are handed to
    ``find_sections``.

    Args:
        wxr: Extraction context; ``wxr.wtp.start_subsection`` is called
            with ``title``.
        base_data: Entry that receives ``etymology_text`` (when the cleaned
            body is non-empty) and is passed on to the form-of and
            pronunciation helpers. A deep copy of it is attached to every
            POS sub-section that is returned.
        node: The etymology section node.
        title: Title of this section.
        num: Starting section number.

    Returns:
        A pair ``(section_num, sublevels)``: the largest of ``num`` and the
        numbers of the POS/pronunciation sub-headings that were seen, and
        the accumulated sub-section tuples (POS sub-sections found here
        plus whatever ``process_pron`` returned).
    """
    # Materialise both child groups up front, before any helper gets a
    # chance to touch the tree.
    content_nodes = list(node.invert_find_child(LEVEL_KIND_FLAGS))
    heading_nodes = list(node.find_child(LEVEL_KIND_FLAGS))

    wxr.wtp.start_subsection(title)

    highest_num = num
    collected: POSReturns = []

    # Form-of data: look at templates sitting directly in the section body
    # and at templates that are direct children of any (nested) list item.
    for idx, content in enumerate(content_nodes):
        if isinstance(content, TemplateNode):
            extract_form_of_templates(
                wxr, base_data, content, content_nodes, idx
            )
        if not (isinstance(content, WikiNode) and content.kind == NodeKind.LIST):
            continue
        for item in content.find_child_recursively(NodeKind.LIST_ITEM):
            for child_idx, child in enumerate(item.children):
                if isinstance(child, TemplateNode):
                    extract_form_of_templates(
                        wxr, base_data, child, item.children, child_idx
                    )

    # Greek wiktionary doesn't seem to rely much on etymology templates, so
    # the cleaned text of the whole body is used as the etymology.
    # Leading ':' / '#' markers are dropped first, then surrounding
    # whitespace.
    cleaned = clean_node(wxr, base_data, content_nodes)
    etym_text = cleaned.lstrip(":#").strip()
    if etym_text:
        base_data.etymology_text = etym_text

    # Loop names are deliberately distinct from the ``title`` and ``num``
    # parameters.
    for heading_type, pos, sub_title, tags, sub_num, subnode in find_sections(
        wxr, heading_nodes
    ):
        if heading_type == Heading.POS:
            highest_num = max(highest_num, sub_num)
            # Each POS sub-section gets its own snapshot of the data
            # gathered so far.
            snapshot = base_data.copy(deep=True)
            collected.append(
                (pos, sub_title, tags, sub_num, subnode, snapshot)
            )
        elif heading_type == Heading.Pron:
            highest_num = max(highest_num, sub_num)
            # NOTE(review): the section number returned by process_pron is
            # not folded back into the running maximum -- confirm whether
            # that is intended.
            _, pron_sublevels = process_pron(
                wxr, subnode, base_data, sub_title, highest_num
            )
            collected.extend(pron_sublevels)

    return highest_num, collected