Coverage for src/wiktextract/extractor/nl/inflection.py: 96%

126 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_inflection_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Route an inflection template to the extractor that handles it.

    "-nlnoun-" and "adjcomp" go to extract_noun_adj_table(), "-nlstam-" goes
    to extract_nlstam_template(). Any other template name is ignored.
    """
    name = t_node.template_name
    if name == "-nlstam-":
        extract_nlstam_template(wxr, word_entry, t_node)
    elif name in ("-nlnoun-", "adjcomp"):
        extract_noun_adj_table(wxr, word_entry, t_node)

19 

20 

def extract_noun_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from an expanded "-nlnoun-" or "adjcomp" table.

    The template is expanded and re-parsed. In every table row, header cells
    contribute column headers, the first data cell is the row header, and
    each non-empty line of the remaining data cells becomes a Form appended
    to ``word_entry.forms``. Lines equal to "-" or to the page title are
    skipped. Afterwards the top-level links of the expansion are cleaned
    with ``word_entry`` as the target.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlnoun-
    # https://nl.wiktionary.org/wiki/Sjabloon:adjcomp
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    # Header texts accumulate over all rows (and tables) of the expansion.
    headers = []
    for table in root.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_label = clean_node(wxr, None, header_cell)
                if header_label != "":
                    headers.append(header_label)

            row_label = ""
            cells = row.find_child(NodeKind.TABLE_CELL)
            for position, cell in enumerate(cells):
                if position == 0:
                    # The first data cell labels the row.
                    row_label = clean_node(wxr, None, cell)
                    continue
                for line in clean_node(wxr, None, cell).splitlines():
                    if line in ("", "-", wxr.wtp.title):
                        continue
                    form = Form(form=line)
                    if row_label not in ("", "naamwoord"):
                        form.raw_tags.append(row_label)
                    # Data column N pairs with header N - 1 because the
                    # first data cell is the row label.
                    header_pos = position - 1
                    if header_pos < len(headers):
                        form.raw_tags.append(headers[header_pos])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)

59 

60 

def extract_nlstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract the forms listed in a "-nlstam-" verb template.

    Template parameter 2 yields a form tagged "past" and parameter 3 a form
    tagged "past" and "participle". The parameter three positions later
    (5 and 6 respectively) supplies the form's ``ipa`` value. The template
    is then cleaned with ``word_entry`` as the target, and the
    "/vervoeging" subpage is processed by extract_vervoeging_page().
    """
    # verb table
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlstam-
    tagged_params = (
        (2, ("past",)),
        (3, ("past", "participle")),
    )
    for param, tags in tagged_params:
        text = clean_node(wxr, None, t_node.template_parameters.get(param, ""))
        if text == "":
            continue
        ipa_text = clean_node(
            wxr, None, t_node.template_parameters.get(param + 3, "")
        )
        form = Form(form=text, ipa=ipa_text)
        form.tags.extend(tags)
        word_entry.forms.append(form)

    clean_node(wxr, word_entry, t_node)
    extract_vervoeging_page(wxr, word_entry)

81 

82 

def extract_vervoeging_page(
    wxr: WiktextractContext, word_entry: WordEntry
) -> None:
    """Extract verb forms from the "<title>/vervoeging" conjugation subpage.

    Looks up the subpage of the current page title in namespace 0, parses
    its body and passes every top-level "-nlverb-" template to
    extract_nlverb_template(). Does nothing when the subpage is not
    available or has no body text.
    """
    page = wxr.wtp.get_page(f"{wxr.wtp.title}/vervoeging", 0)
    # Guard against a page record without text as well as a missing page;
    # parse() needs a string to work on.
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    for t_node in root.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "-nlverb-":
            extract_nlverb_template(wxr, word_entry, t_node)

93 

94 

@dataclass
class TableHeader:
    """Text of a table header cell plus the grid area it covers."""

    # Cleaned text of the header cell.
    text: str
    # First column covered by the header and the number of columns it spans.
    col_index: int
    colspan: int
    # First row covered by the header and the number of rows it spans.
    row_index: int
    rowspan: int

102 

103 

# Leading text of a "-nlverb-" table header cell mapped to the tags that
# extract_nlverb_template() adds to its shared tag list when a header cell
# starts with that text. The first matching prefix wins.
NLVERB_HEADER_PREFIXES = {
    "vervoeging van de bedrijvende vorm van": ["active"],
    "onpersoonlijke lijdende vorm": ["impersonal", "passive"],
    "lijdende vorm": ["passive"],
}

109 

110 

def _nlverb_cell_span(cell_node: WikiNode, attr_name: str) -> int:
    """Return the integer value of a cell's span attribute, defaulting to 1.

    ``attr_name`` is "colspan" or "rowspan". A missing attribute or a value
    that is not made up of digits only yields 1.
    """
    span_str = cell_node.attrs.get(attr_name, "1")
    return int(span_str) if re.fullmatch(r"\d+", span_str) else 1


def extract_nlverb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract conjugated forms from an expanded "-nlverb-" table.

    The template is expanded and re-parsed. Top-level links of the expansion
    are cleaned with ``word_entry`` as the target. Each table is then walked
    row by row:

    * A row consisting only of header cells that follows a row with at least
      one data cell starts a new section: the row counter and all collected
      tags and headers are reset.
    * A header cell whose text starts with a key of NLVERB_HEADER_PREFIXES
      adds that key's tags to the section's shared tags.
    * Any other header cell in an all-header row becomes a shared raw tag
      (first cell of the row) or a column header (later cells); in a mixed
      row it becomes a row header, with any text from the first "(" onwards
      removed.
    * Every other non-empty cell whose text differs from the page title
      becomes a Form carrying the shared tags, the row headers covering its
      row and the column headers covering its column.

    Forms are appended to ``word_entry.forms`` with ``source`` set to
    "<title>/vervoeging".
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlverb-
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        row_index = 0
        shared_tags = []
        shared_raw_tags = []
        last_row_all_header = False
        col_headers = []
        row_headers = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            # Row headers spanning down from earlier rows occupy the leading
            # columns of this row, so the first cell written in this row
            # starts after them. The horizontal space a spanning cell takes
            # is its colspan (its rowspan only says how far down it reaches).
            col_index = 0
            for row_header in row_headers:
                if (
                    row_header.row_index
                    <= row_index
                    < row_header.row_index + row_header.rowspan
                ):
                    col_index += row_header.colspan

            current_row_all_header = all(
                nlverb_table_cell_is_header(n)
                for n in row_node.find_child(cell_kinds)
            )
            if current_row_all_header and not last_row_all_header:
                # First header-only row after data rows: new table section.
                row_index = 0
                shared_tags.clear()
                shared_raw_tags.clear()
                col_headers.clear()
                row_headers.clear()

            is_row_first_node = True
            for cell_node in row_node.find_child(cell_kinds):
                cell_colspan = _nlverb_cell_span(cell_node, "colspan")
                cell_rowspan = _nlverb_cell_span(cell_node, "rowspan")
                cell_str = clean_node(wxr, None, cell_node)
                if cell_str in ["", wxr.wtp.title]:
                    # Empty cells and cells repeating the page title carry
                    # no form but still take up columns.
                    col_index += cell_colspan
                    is_row_first_node = False
                    continue

                if nlverb_table_cell_is_header(cell_node):
                    for (
                        header_prefix,
                        prefix_tags,
                    ) in NLVERB_HEADER_PREFIXES.items():
                        if cell_str.startswith(header_prefix):
                            shared_tags.extend(prefix_tags)
                            break
                    else:
                        # No known prefix matched: keep the text as a header.
                        if current_row_all_header:
                            if is_row_first_node:
                                shared_raw_tags.append(cell_str)
                            else:
                                col_headers.append(
                                    TableHeader(
                                        cell_str,
                                        col_index,
                                        cell_colspan,
                                        row_index,
                                        cell_rowspan,
                                    )
                                )
                        else:
                            if "(" in cell_str:
                                cell_str = cell_str[
                                    : cell_str.index("(")
                                ].strip()
                            row_headers.append(
                                TableHeader(
                                    cell_str,
                                    col_index,
                                    cell_colspan,
                                    row_index,
                                    cell_rowspan,
                                )
                            )
                else:
                    # Hand each form its own copies so that the per-form
                    # appends below can never leak into the shared lists.
                    form = Form(
                        form=cell_str,
                        tags=list(shared_tags),
                        raw_tags=list(shared_raw_tags),
                        source=f"{wxr.wtp.title}/vervoeging",
                    )
                    for row_header in row_headers:
                        if (
                            row_header.row_index
                            <= row_index
                            < row_header.row_index + row_header.rowspan
                        ):
                            form.raw_tags.append(row_header.text)
                    for col_header in col_headers:
                        if (
                            col_header.col_index
                            <= col_index
                            < col_header.col_index + col_header.colspan
                        ):
                            form.raw_tags.append(col_header.text)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

                col_index += cell_colspan
                is_row_first_node = False

            row_index += 1
            last_row_all_header = current_row_all_header

231 

232 

def nlverb_table_cell_is_header(node: WikiNode) -> bool:
    """Tell whether a "-nlverb-" table cell acts as a header.

    True for header-cell nodes and for cells whose ``class`` attribute is
    exactly "infoboxrijhoofding".
    """
    if node.kind == NodeKind.TABLE_HEADER_CELL:
        return True
    return node.attrs.get("class", "") == "infoboxrijhoofding"