Coverage for src / wiktextract / extractor / pl / etymology.py: 82%

37 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from collections import defaultdict 

3 

4from wikitextprocessor.parser import NodeKind, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import WordEntry 

9 

10 

def extract_etymology_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Collect etymology texts under ``level_node`` and attach them to entries.

    Every list item found recursively under ``level_node`` is scanned. A
    string child containing a parenthesised run of digits, whitespace,
    commas, hyphens and dots (for example ``(1.1)``) sets the current sense
    index; only the part of that string after the closing parenthesis is
    kept. The sense index is not reset between list items, so an item
    without a marker is filed under the most recently seen index (``""``
    until the first marker is seen).

    If no list item yields any text, the whole section is cleaned as a
    single text and filed under the current sense index.

    Args:
        wxr: Context object passed through to ``clean_node``.
        page_data: Entries extracted so far. Entries whose ``lang_code``
            equals ``base_data.lang_code`` get ``etymology_texts`` assigned
            for every collected key that is ``""`` or for which
            ``match_sense_index(key, entry)`` is truthy; when several keys
            qualify, the last one in insertion order wins.
        base_data: Receives the texts collected under the ``""`` key, or an
            empty list when there are none.
        level_node: The etymology section node to read.
    """
    # Function-scope import kept from the original; presumably it avoids a
    # circular import with `.page` -- TODO confirm.
    from .page import match_sense_index

    # The hyphen is placed last so it is a literal. The earlier spelling
    # `,-.` was a character range that merely happened to cover ',', '-'
    # and '.', so the matched set is unchanged.
    sense_marker = re.compile(r"\(([\d\s,.-]+)\)")

    etymology_texts = defaultdict(list)
    has_list = False
    sense_index = ""
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        e_nodes = []
        for node in list_item.children:
            if not isinstance(node, str):
                e_nodes.append(node)
                continue
            m = sense_marker.search(node)
            if m is None:
                e_nodes.append(node)
                continue
            sense_index = m.group(1)
            # NOTE(review): text before the marker in the same string node
            # is dropped, exactly as before -- confirm this is intended.
            remain_str = node[m.end():]
            if remain_str:
                e_nodes.append(remain_str)
        text = clean_node(wxr, None, e_nodes)
        if text:
            etymology_texts[sense_index].append(text)
            has_list = True

    if not has_list:
        # No list item produced text: fall back to the whole section.
        text = clean_node(wxr, None, level_node.children)
        if text:
            etymology_texts[sense_index].append(text)

    for data in page_data:
        if data.lang_code != base_data.lang_code:
            continue
        # A distinct loop name keeps the parser state `sense_index` from
        # being shadowed.
        for index, texts in etymology_texts.items():
            if index == "" or match_sense_index(index, data):
                # Copy so entries never share one mutable list. This may be
                # redundant if the model copies on assignment -- not
                # visible from this file.
                data.etymology_texts = list(texts)

    # `.get` does not trigger the default factory, so no "" key is created.
    base_data.etymology_texts = list(etymology_texts.get("", []))