Coverage for src/wiktextract/extractor/de/linkage.py: 89%

80 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Descendant, Linkage, Translation, WordEntry 

9from .tags import translate_raw_tags 

10from .utils import extract_sense_index 

11 

12 

def extract_linkages(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect the linkages of one section and store them on ``word_entry``.

    Every list item found anywhere below ``level_node`` is passed to
    ``process_linkage_list_item``; the linkages gathered that way are then
    appended to the ``word_entry`` attribute named by ``linkage_type``.
    """
    collected = []
    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_linkage_list_item(wxr, item_node, collected, linkage_type)
    # The attribute is looked up only after all items have been processed.
    target = getattr(word_entry, linkage_type)
    target.extend(collected)

23 

24 

def process_linkage_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    linkage_list: list[Linkage],
    linkage_type: str,
) -> None:
    """Turn the children of one list item into ``Linkage`` entries.

    Children are visited in order:

    * a string starting with ``[`` updates the current sense index (via
      ``extract_sense_index``); any other string containing ``,`` or ``;``
      resets the collected raw tags;
    * for ``linkage_type == "expressions"``, the first string containing a
      dash starts the note: that string and every later child become note
      material;
    * italic text ending in ``:`` is a raw tag, other italic nodes are
      scanned for links;
    * templates whose name ends in ``.`` contribute a raw tag;
    * link nodes become linkages, except index-namespace links.

    New entries are appended to ``linkage_list``. If note material was
    gathered, it is cleaned and stored on the last entry of ``linkage_list``.
    """
    current_sense = ""
    pending_tags = []
    collecting_note = False
    note_parts = []

    for node in list_item_node.children:
        if collecting_note:
            # Everything after the dash belongs to the note.
            note_parts.append(node)
            continue

        if isinstance(node, str):
            if node.startswith("["):
                current_sense, _ = extract_sense_index(node)
            elif "," in node or ";" in node:
                # A separator ends the scope of the tags seen so far.
                pending_tags.clear()
            if linkage_type == "expressions" and contains_dash(node):
                collecting_note = True
                note_parts.append(node)
            continue

        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, node)
            if italic_text.endswith(":"):
                pending_tags.append(italic_text.strip(": "))
                continue
            for inner_link in node.find_child(NodeKind.LINK):
                inner_word = clean_node(wxr, None, inner_link)
                if not inner_word:
                    continue
                entry = Linkage(
                    word=inner_word,
                    sense_index=current_sense,
                    raw_tags=pending_tags,
                )
                translate_raw_tags(entry)
                linkage_list.append(entry)
            continue

        if isinstance(node, TemplateNode) and node.template_name.endswith(
            "."
        ):
            label = clean_node(wxr, None, node).strip(",: ")
            if label:
                pending_tags.append(label)
            continue

        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            target_word = clean_node(wxr, None, node)
            # https://de.wiktionary.org/wiki/Wiktionary:Verzeichnis
            # ignore index namespace links
            if target_word and not target_word.startswith("Verzeichnis:"):
                entry = Linkage(
                    word=target_word,
                    sense_index=current_sense,
                    raw_tags=pending_tags,
                )
                translate_raw_tags(entry)
                linkage_list.append(entry)

    if note_parts and linkage_list:
        note_text = clean_node(wxr, None, note_parts)
        linkage_list[-1].note = note_text.strip("–—―‒- ")

83 

84 

def contains_dash(text: str) -> bool:
    """Return True if ``text`` contains a hyphen or one of the dash variants."""
    found = re.search(r"[–—―‒-]", text)
    return found is not None

87 

88 

def extract_descendant_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Process every item of the lists that are direct children of the section.

    Each list item is handed to ``extract_descendant_list_item`` together
    with ``word_entry``.
    """
    # Lazy flattening keeps the original visiting order: list by list,
    # item by item.
    descendant_items = (
        item_node
        for sub_list in level_node.find_child(NodeKind.LIST)
        for item_node in sub_list.find_child(NodeKind.LIST_ITEM)
    )
    for item_node in descendant_items:
        extract_descendant_list_item(wxr, word_entry, item_node)

95 

96 

def extract_descendant_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Append the descendants found in one list item to ``word_entry``.

    Children of ``list_item`` are visited in order:

    * a string whose stripped form starts with ``[`` sets the sense index
      (via ``extract_sense_index``) used for the following descendants;
    * italic text ending in ``:`` sets the language name, and the language
      code is looked up with ``name_to_code``; both default to "unknown";
    * a link node with non-empty cleaned text becomes a ``Descendant``;
    * a template whose name starts with ``Ü`` is parsed by
      ``process_u_template`` and becomes a ``Descendant`` when it yields a
      non-empty word.
    """
    lang_name = "unknown"
    lang_code = "unknown"
    sense_index = ""
    for node in list_item.children:
        if isinstance(node, str) and node.strip().startswith("["):
            sense_index, _ = extract_sense_index(node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            node_str = clean_node(wxr, None, node)
            if node_str.endswith(":"):
                lang_name = node_str.strip(": ")
                lang_code = name_to_code(lang_name, "de") or "unknown"
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            node_str = clean_node(wxr, None, node)
            # Test the cleaned text, not the node object: a node never
            # equals "", so the old check let empty words through.
            if node_str != "":
                word_entry.descendants.append(
                    Descendant(
                        lang=lang_name,
                        lang_code=lang_code,
                        word=node_str,
                        sense_index=sense_index,
                    )
                )
        elif isinstance(node, TemplateNode) and node.template_name.startswith(
            "Ü"
        ):
            # Imported locally, as in the original code
            # NOTE(review): presumably to avoid a circular import — confirm.
            from .translation import process_u_template

            tr_data = Translation(lang=lang_name, lang_code=lang_code)
            process_u_template(wxr, tr_data, node)
            if tr_data.word != "":
                word_entry.descendants.append(
                    Descendant(
                        lang=tr_data.lang,
                        lang_code=tr_data.lang_code,
                        word=tr_data.word,
                        roman=tr_data.roman,
                        sense_index=sense_index,
                    )
                )

138 )