Coverage for src/wiktextract/extractor/de/linkage.py: 89%

60 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2from typing import Optional 

3 

4from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Linkage, WordEntry 

9from .tags import translate_raw_tags 

10from .utils import extract_sense_index 

11 

12 

def extract_linkages(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Gather linkages below ``level_node`` and add them to ``word_entry``.

    Every list item found recursively under ``level_node`` is handed to
    ``process_linkage_list_item``.  The collected results are appended to
    the list stored on ``word_entry`` under the attribute named by
    ``linkage_type``.
    """
    collected = []
    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_linkage_list_item(wxr, item_node, collected, linkage_type)
    # The target attribute is looked up only after all items were processed.
    getattr(word_entry, linkage_type).extend(collected)

24 

25 

def process_linkage_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    linkage_list: list[Linkage],
    linkage_type: str,
) -> None:
    """Extract the linkages of one list item and append them in place.

    The children of ``list_item_node`` are scanned in order:

    * a string starting with ``[`` updates the current sense index via
      ``extract_sense_index``; otherwise a string containing ``,`` or ``;``
      clears the raw tags gathered so far;
    * for the ``"expressions"`` linkage type, the first string containing
      a dash and every child after it are kept as note content;
    * an italic node whose cleaned text ends with ``:`` contributes a raw
      tag, any other italic node contributes one linkage per link child;
    * a template whose name ends with ``.`` contributes a raw tag;
    * a link node contributes a linkage unless its text is empty or
      starts with ``Verzeichnis:``.

    Results are appended to ``linkage_list``; nothing is returned.
    """
    current_sense = ""
    pending_tags = []
    note_parts = []
    collecting_note = False

    def add_linkage(term: str) -> None:
        # Create an entry from the state gathered so far and store it.
        entry = Linkage(
            word=term, sense_index=current_sense, raw_tags=pending_tags
        )
        translate_raw_tags(entry)
        linkage_list.append(entry)

    for node in list_item_node.children:
        if collecting_note:
            # Everything after the dash belongs to the note.
            note_parts.append(node)
            continue

        if isinstance(node, str):
            if node.startswith("["):
                current_sense, _ = extract_sense_index(node)
            elif "," in node or ";" in node:
                pending_tags.clear()
            # The dash check applies to every string child.
            if linkage_type == "expressions" and contains_dash(node):
                collecting_note = True
                note_parts.append(node)
            continue

        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, node)
            if italic_text.endswith(":"):
                pending_tags.append(italic_text.strip(": "))
                continue
            for inner_link in node.find_child(NodeKind.LINK):
                inner_text = clean_node(wxr, None, inner_link)
                if inner_text != "":
                    add_linkage(inner_text)
            continue

        if isinstance(node, TemplateNode) and node.template_name.endswith(
            "."
        ):
            tag_text = clean_node(wxr, None, node).strip(",: ")
            if tag_text != "":
                pending_tags.append(tag_text)
            continue

        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            term = clean_node(wxr, None, node)
            # https://de.wiktionary.org/wiki/Wiktionary:Verzeichnis
            # Links to pages in this namespace are ignored for now;
            # what they contain should be investigated later.
            if len(term) > 0 and not term.startswith("Verzeichnis:"):
                add_linkage(term)

    # NOTE(review): the last entry of the shared list may have been added
    # by an earlier list item - confirm that attaching the note to it is
    # intended.
    if note_parts and linkage_list:
        note_text = clean_node(wxr, None, note_parts)
        linkage_list[-1].note = note_text.strip("–—―‒- ")

85 

86 

def process_link(
    wxr: WiktextractContext,
    sense_idx: str,
    link_node: WikiNode,
) -> Optional[Linkage]:
    """Turn a link node into a ``Linkage``, or return ``None`` to skip it.

    The cleaned text of ``link_node`` becomes the linkage word and
    ``sense_idx`` its sense index.  ``None`` is returned when the text is
    empty or starts with ``Verzeichnis:``.
    """
    link_text = clean_node(wxr, None, link_node)
    # https://de.wiktionary.org/wiki/Wiktionary:Verzeichnis
    # Links to pages in this namespace are ignored for now;
    # what they contain should be investigated later.
    is_usable = len(link_text) > 0 and not link_text.startswith(
        "Verzeichnis:"
    )
    return Linkage(word=link_text, sense_index=sense_idx) if is_usable else None

99 

100 

def contains_dash(text: str) -> bool:
    """Return True when ``text`` contains a hyphen or dash character.

    The characters checked are those in the pattern below: en dash,
    em dash, horizontal bar, figure dash and the ASCII hyphen-minus.
    """
    match = re.search(r"[–—―‒-]", text)
    # Match objects are always truthy, so this is True on a hit and
    # False when re.search returned None.
    return bool(match)