Coverage for src/wiktextract/extractor/nl/linkage.py: 85%

1import re

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from .models import Linkage, WordEntry

8from .tags import LIST_ITEM_TAG_TEMPLATES

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect linkage words from a section into ``word_entry``.

    Walks the direct children of ``level_node`` in order and appends
    ``Linkage`` objects to the list stored in the ``linkage_type`` attribute
    of ``word_entry``.  Templates seen along the way update the running
    sense text, sense index and raw tags applied to the links that follow:

    * ``intens``: sets the raw tags to ``["intensivering"]`` and takes the
      sense index from argument 2 when it consists only of digits.
    * ``L-top``: argument 2 supplies the sense; a ``[n]`` marker in it gives
      the sense index and the text after the marker becomes the sense.
    * ``L-bottom``: resets the sense and the sense index.
    * ``nld-*`` and ``expr``/``fras``: handled by their own extractors.

    Args:
        wxr: Extraction context handed to ``clean_node`` and the helpers.
        word_entry: Entry whose ``linkage_type`` list receives the results.
        level_node: Section node whose children are scanned.
        linkage_type: Name of the list attribute on ``word_entry``.
    """
    sense_index = 0
    sense = ""
    raw_tags = []
    for node in level_node.children:
        # Templates are tested before the generic WikiNode branch; keep this
        # order (TemplateNode is presumably a WikiNode subclass).
        if isinstance(node, TemplateNode):
            if node.template_name == "intens":
                # https://nl.wiktionary.org/wiki/Sjabloon:intens
                raw_tags = ["intensivering"]
                # Go through clean_node like every other argument read in
                # this module: a template argument is not guaranteed to be a
                # plain string, and calling .strip() on a node would raise.
                s_index_str = clean_node(
                    wxr, None, node.template_parameters.get(2, "")
                ).strip()
                if re.fullmatch(r"\d+", s_index_str):
                    sense_index = int(s_index_str)
            elif node.template_name == "L-top":
                second_arg = clean_node(
                    wxr, None, node.template_parameters.get(2, "")
                )
                m = re.search(r"\[(\d+)\]", second_arg)
                if m is not None:
                    sense_index = int(m.group(1))
                    # Whatever follows the "[n]" marker is the sense text.
                    sense = second_arg[m.end() :].strip()
                else:
                    sense = second_arg
            elif node.template_name == "L-bottom":
                # End of an L-top group: forget its sense information.
                sense = ""
                sense_index = 0
            elif node.template_name.startswith("nld-"):
                extract_nld_template(wxr, word_entry, node, linkage_type)
            elif node.template_name in ["expr", "fras"]:
                extract_expr_template(wxr, word_entry, node, linkage_type)
        elif isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "":
                    getattr(word_entry, linkage_type).append(
                        Linkage(
                            word=word,
                            sense=sense,
                            sense_index=sense_index,
                            raw_tags=raw_tags,
                        )
                    )
            elif node.kind == NodeKind.LIST:
                # NOTE(review): raw_tags is not forwarded to list items, so
                # only bare links receive the "intensivering" raw tag.
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_linkage_list_item(
                        wxr,
                        word_entry,
                        list_item,
                        linkage_type,
                        sense,
                        sense_index,
                    )

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
) -> None:
    """Extract the linkage words of one list item into ``word_entry``.

    The children of ``list_item`` are scanned in order:

    * a string containing ``[n]`` updates the sense index used for the
      entries created afterwards;
    * a string starting with ``=`` or ``–`` introduces the sense text, which
      spans the rest of the item and ends the scan;
    * a link node becomes a new ``Linkage``;
    * an ``expr`` template is handled by ``extract_expr_template``;
    * a template listed in ``LIST_ITEM_TAG_TEMPLATES`` contributes a tag.

    Args:
        wxr: Extraction context handed to ``clean_node``.
        word_entry: Entry whose ``linkage_type`` list receives the results.
        list_item: List item node whose children are scanned.
        linkage_type: Name of the list attribute on ``word_entry``.
        sense: Sense text inherited from the enclosing section.
        sense_index: Sense number inherited from the enclosing section.
    """
    linkage_list = getattr(word_entry, linkage_type)
    # Length on entry, used to tell whether this item has added anything yet.
    orig_len = len(linkage_list)
    # Tags seen before any linkage of this item has been created.
    tags = []
    for index, node in enumerate(list_item.children):
        if isinstance(node, str):
            m = re.search(r"\[(\d+)\]", node)
            if m is not None:
                sense_index = int(m.group(1))
            elif node.strip().startswith(("=", "–")):
                # Everything from the separator onwards is the sense text.
                sense = clean_node(wxr, None, list_item.children[index:]).strip(
                    "=– "
                )
                if len(linkage_list) > orig_len:
                    # A word was already collected: attach the sense to it.
                    linkage_list[-1].sense = sense
                else:
                    # No link so far: the word is the non-template content
                    # that precedes the separator.
                    word_nodes = [
                        n
                        for n in list_item.children[:index]
                        if not isinstance(n, TemplateNode)
                    ]
                    word = clean_node(wxr, None, word_nodes)
                    if word != "":
                        linkage_list.append(
                            Linkage(
                                word=word,
                                sense=sense,
                                sense_index=sense_index,
                                tags=tags,
                            )
                        )
                # The sense consumed the remainder of the item.
                return
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if word != "":
                # NOTE(review): tags collected before this link are not
                # passed on here; confirm whether that is intended.
                linkage_list.append(
                    Linkage(word=word, sense=sense, sense_index=sense_index)
                )
        elif isinstance(node, TemplateNode):
            if node.template_name == "expr":
                extract_expr_template(wxr, word_entry, node, linkage_type)
            elif node.template_name in LIST_ITEM_TAG_TEMPLATES:
                if len(linkage_list) > orig_len:
                    # Tag follows a word of this item: attach it directly.
                    linkage_list[-1].tags.append(
                        LIST_ITEM_TAG_TEMPLATES[node.template_name]
                    )
                else:
                    tags.append(LIST_ITEM_TAG_TEMPLATES[node.template_name])

124

125

def extract_nld_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract linkage words from an expanded ``nld-*`` list template.

    The template is expanded and re-parsed.  The sense index comes from
    template argument 1 when it is all digits (otherwise 0), the sense from
    the last link inside the first italic node of the expansion, and every
    non-empty link inside a list item is appended as a ``Linkage`` to the
    ``linkage_type`` list of ``word_entry``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:nld-rashonden
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    index_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    sense_index = int(index_text) if re.fullmatch(r"\d+", index_text) else 0

    # Only the first italic node is inspected for the sense.
    sense = ""
    first_italic = next(
        iter(expanded.find_child_recursively(NodeKind.ITALIC)), None
    )
    if first_italic is not None:
        for sense_link in first_italic.find_child(NodeKind.LINK):
            sense = clean_node(wxr, None, sense_link)

    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        for word_link in item.find_child(NodeKind.LINK):
            word = clean_node(wxr, None, word_link)
            if word == "":
                continue
            getattr(word_entry, linkage_type).append(
                Linkage(word=word, sense=sense, sense_index=sense_index)
            )

155

156

def extract_expr_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract one linkage word from an ``expr`` or ``fras`` template.

    For ``expr`` the word is argument 1 and the sense argument 2; for any
    other template name handled here the word is argument 2 and the sense
    argument 3.  The sense index comes from the ``n`` argument when it is all
    digits, and is overridden by a number (optionally bracketed) at the start
    of the word, which is then removed from the word.  A non-empty word is
    appended as a ``Linkage`` to the ``linkage_type`` list of ``word_entry``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:expr
    # https://nl.wiktionary.org/wiki/Sjabloon:fras
    params = t_node.template_parameters
    n_text = clean_node(wxr, None, params.get("n", ""))
    if re.fullmatch(r"\d+", n_text) is not None:
        sense_index = int(n_text)
    else:
        sense_index = 0

    if t_node.template_name == "expr":
        word_arg, sense_arg = 1, 2
    else:
        word_arg, sense_arg = 2, 3
    sense = clean_node(wxr, None, params.get(sense_arg, ""))
    word = clean_node(wxr, None, params.get(word_arg, ""))

    # Some pages put the sense number in front of the word instead of
    # using the "n" argument.
    leading_number = re.match(r"\[?(\d+)\]?", word)
    if leading_number is not None:
        sense_index = int(leading_number.group(1))
        word = word[leading_number.end() :].strip()

    if word == "":
        return
    getattr(word_entry, linkage_type).append(
        Linkage(word=word, sense=sense, sense_index=sense_index)
    )

183

184

def extract_fixed_preposition_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add every list item of the section to ``word_entry.derived``.

    The cleaned text of each list item found anywhere below ``level_node``
    becomes a ``Linkage`` tagged ``"prepositional"``; items that clean to an
    empty string are skipped.
    """
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        text = clean_node(wxr, None, item.children)
        if not text:
            continue
        entry = Linkage(word=text, tags=["prepositional"])
        word_entry.derived.append(entry)