Coverage for src/wiktextract/extractor/nl/linkage.py: 85%

104 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Linkage, WordEntry 

8from .tags import LIST_ITEM_TAG_TEMPLATES 

9 

10 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect the linkages found directly under ``level_node``.

    New ``Linkage`` objects are appended to the list stored in the
    ``linkage_type`` attribute of ``word_entry``. The direct children of
    the section are scanned in order while a running ``sense``,
    ``sense_index`` and ``raw_tags`` state is maintained:

    * ``intens`` sets ``raw_tags`` to ``["intensivering"]`` and, when its
      second argument is a number, updates ``sense_index``.
    * ``L-top`` reads ``sense_index`` and ``sense`` from its second
      argument; ``L-bottom`` resets both.
    * ``nld-*``, ``expr`` and ``fras`` templates are delegated to their
      dedicated extractors.
    * A bare link becomes a linkage carrying the current state.
    * Every item of a list is handed to ``extract_linkage_list_item``
      together with the current ``sense`` and ``sense_index``.
    """
    sense_index = 0
    sense = ""
    raw_tags = []
    for node in level_node.children:
        if isinstance(node, TemplateNode):
            if node.template_name == "intens":
                # https://nl.wiktionary.org/wiki/Sjabloon:intens
                raw_tags = ["intensivering"]
                # The argument may be a parsed node or a list of nodes
                # rather than a plain string, so convert it with
                # clean_node() (as done for the other template arguments
                # in this module) before using str methods on it.
                s_index_str = clean_node(
                    wxr, None, node.template_parameters.get(2, "")
                ).strip()
                if re.fullmatch(r"\d+", s_index_str):
                    sense_index = int(s_index_str)
            elif node.template_name == "L-top":
                second_arg = clean_node(
                    wxr, None, node.template_parameters.get(2, "")
                )
                m = re.search(r"\[(\d+)\]", second_arg)
                if m is not None:
                    # "[N] gloss": the number is the sense index and the
                    # text after it is the gloss
                    sense_index = int(m.group(1))
                    sense = second_arg[m.end() :].strip()
                else:
                    sense = second_arg
            elif node.template_name == "L-bottom":
                sense = ""
                sense_index = 0
            elif node.template_name.startswith("nld-"):
                extract_nld_template(wxr, word_entry, node, linkage_type)
            elif node.template_name in ["expr", "fras"]:
                extract_expr_template(wxr, word_entry, node, linkage_type)
        elif isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "":
                    getattr(word_entry, linkage_type).append(
                        Linkage(
                            word=word,
                            sense=sense,
                            sense_index=sense_index,
                            raw_tags=raw_tags,
                        )
                    )
            elif node.kind == NodeKind.LIST:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_linkage_list_item(
                        wxr,
                        word_entry,
                        list_item,
                        linkage_type,
                        sense,
                        sense_index,
                    )

67 

68 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
) -> None:
    """Extract the linkages contained in a single list item.

    Results are appended to the list stored in the ``linkage_type``
    attribute of ``word_entry``.

    ``list_item`` is the list-item node whose children are scanned.
    ``sense`` and ``sense_index`` are the values inherited from the
    enclosing section; text inside the item may override them.
    """
    linkage_list = getattr(word_entry, linkage_type)
    # Length on entry: tells whether this item has already produced a
    # linkage that later nodes should be attached to.
    orig_len = len(linkage_list)
    # Tags seen before any linkage was created for this item.
    tags = []
    for index, node in enumerate(list_item.children):
        if isinstance(node, str):
            m = re.search(r"\[(\d+)\]", node)
            if m is not None:
                sense_index = int(m.group(1))
            elif node.strip().startswith(("=", "–")):
                # Everything from this node to the end of the item is the
                # gloss of the word that precedes it.
                sense = clean_node(wxr, None, list_item.children[index:]).strip(
                    "=– "
                )
                if len(linkage_list) > orig_len:
                    linkage_list[-1].sense = sense
                else:
                    # No linkage was added for this item yet: build the word
                    # from the nodes before the separator, leaving out
                    # templates.
                    word_nodes = [
                        n
                        for n in list_item.children[:index]
                        if not isinstance(n, TemplateNode)
                    ]
                    word = clean_node(wxr, None, word_nodes)
                    if word != "":
                        linkage_list.append(
                            Linkage(
                                word=word,
                                sense=sense,
                                sense_index=sense_index,
                                tags=tags,
                            )
                        )
                # The remaining nodes were consumed as the gloss above.
                return
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if word != "":
                linkage_list.append(
                    Linkage(word=word, sense=sense, sense_index=sense_index)
                )
        elif isinstance(node, TemplateNode):
            if node.template_name == "expr":
                extract_expr_template(wxr, word_entry, node, linkage_type)
            elif node.template_name in LIST_ITEM_TAG_TEMPLATES:
                if len(linkage_list) > orig_len:
                    # A linkage already exists for this item: tag it directly
                    linkage_list[-1].tags.append(
                        LIST_ITEM_TAG_TEMPLATES[node.template_name]
                    )
                else:
                    tags.append(LIST_ITEM_TAG_TEMPLATES[node.template_name])

124 

125 

def extract_nld_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract linkages from an expanded ``nld-*`` template.

    The template is expanded and re-parsed. The first positional argument
    supplies the sense index when it is a number. The gloss is taken from
    the links of the first italic node in the expansion, and every link
    inside a list item becomes a ``Linkage`` appended to the
    ``linkage_type`` list of ``word_entry``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:nld-rashonden
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )

    first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    sense_index = int(first_arg) if re.fullmatch(r"\d+", first_arg) else 0

    # Only the first italic node is inspected; if it holds several links
    # the last one wins.
    sense = ""
    first_italic = next(
        iter(expanded_node.find_child_recursively(NodeKind.ITALIC)), None
    )
    if first_italic is not None:
        for link_node in first_italic.find_child(NodeKind.LINK):
            sense = clean_node(wxr, None, link_node)

    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        for link_node in list_item.find_child(NodeKind.LINK):
            word = clean_node(wxr, None, link_node)
            if word == "":
                continue
            getattr(word_entry, linkage_type).append(
                Linkage(word=word, sense_index=sense_index, sense=sense)
            )

155 

156 

def extract_expr_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract one linkage from an ``expr`` or ``fras`` template.

    For ``expr`` the word is argument 1 and the gloss argument 2; for any
    other template name handled here they are arguments 2 and 3. The
    sense index comes from the ``n`` argument when it is a number, and is
    overridden by a number (optionally in square brackets) at the start
    of the word. A non-empty word is appended as a ``Linkage`` to the
    ``linkage_type`` list of ``word_entry``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:expr
    # https://nl.wiktionary.org/wiki/Sjabloon:fras
    params = t_node.template_parameters

    n_arg = clean_node(wxr, None, params.get("n", ""))
    if re.fullmatch(r"\d+", n_arg) is not None:
        sense_index = int(n_arg)
    else:
        sense_index = 0

    if t_node.template_name == "expr":
        word_arg, sense_arg = 1, 2
    else:
        word_arg, sense_arg = 2, 3
    sense = clean_node(wxr, None, params.get(sense_arg, ""))
    word = clean_node(wxr, None, params.get(word_arg, ""))

    leading_index = re.match(r"\[?(\d+)\]?", word)
    if leading_index is not None:  # should use "n" arg
        sense_index = int(leading_index.group(1))
        word = word[leading_index.end() :].strip()

    if word == "":
        return
    getattr(word_entry, linkage_type).append(
        Linkage(word=word, sense=sense, sense_index=sense_index)
    )

183 

184 

def extract_fixed_preposition_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add every list item under ``level_node`` as a derived linkage.

    The cleaned text of each list item, when non-empty, is appended to
    ``word_entry.derived`` as a ``Linkage`` tagged ``"prepositional"``.
    """
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        word = clean_node(wxr, None, list_item.children)
        if not word:
            continue
        word_entry.derived.append(Linkage(word=word, tags=["prepositional"]))