Coverage for src/wiktextract/extractor/de/flexion.py: 94%

118 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor import NodeKind 

4from wikitextprocessor.parser import HTMLNode, TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def parse_flexion_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Add the forms listed on a "Flexion" namespace page to ``word_entry``.

    The body of the page named ``page_title`` in the "Flexion" namespace is
    fetched and parsed, then every template found anywhere in the parse tree
    is dispatched on the start of its name:

    * "Deklinationsseite..." -> ``process_deklinationsseite_template``
    * "Deutsch Verb..."      -> ``process_deutsch_verb_template``

    Templates with any other name are ignored.  The function returns without
    doing anything when ``get_page_body`` yields ``None``.

    See https://de.wiktionary.org/wiki/Hilfe:Flexionsseiten
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Flexion"]["id"]
    page_body = wxr.wtp.get_page_body(page_title, namespace_id)
    if page_body is None:
        return

    root = wxr.wtp.parse(page_body)
    for template in root.find_child_recursively(NodeKind.TEMPLATE):
        name = template.template_name
        if name.startswith("Deklinationsseite"):
            handler = process_deklinationsseite_template
        elif name.startswith("Deutsch Verb"):
            handler = process_deutsch_verb_template
        else:
            continue
        handler(wxr, word_entry, template, page_title)

33 

34 

@dataclass
class SpanHeader:
    """A table header and the range of columns it applies to."""

    # Cleaned text of the header cell; used as a raw tag on matching forms.
    text: str
    # Column index at which the header starts.
    index: int
    # Number of columns covered: the header applies to column ``c`` when
    # ``index <= c < index + span``.
    span: int

40 

41 

def process_deklinationsseite_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Collect forms from an expanded "Deklinationsseite" template.

    The template is expanded and re-parsed, then its direct HTML and table
    children are visited in order.  The text of the most recent ``<h4>``
    element is attached as a raw tag to every form of the tables after it.

    Inside a table:

    * empty header cells are skipped;
    * an "Artikel" or "Wortform" header switches on article mode, in which
      data cells at an even column index hold an article that is prefixed
      to the form in the following cell;
    * a header cell with a ``colspan`` attribute becomes a column header
      (a span of 9 is treated as the title of a new table and resets the
      collected column headers and article mode);
    * any other header cell is the header of the current row.

    Forms whose text is "" or "—" are dropped.

    Args:
        wxr: Extraction context used to expand, parse and clean wikitext.
        word_entry: Entry whose ``forms`` list receives the new forms.
        template_node: The "Deklinationsseite..." template to process.
        page_tite: Title stored as the ``source`` of every created form.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Deklinationsseite_Adjektiv
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    any_cell = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    placeholders = ("", "—")

    section_title = ""
    for child in expanded.find_child(NodeKind.HTML | NodeKind.TABLE):
        if isinstance(child, HTMLNode) and child.tag == "h4":
            section_title = clean_node(wxr, None, child)
            continue
        if child.kind != NodeKind.TABLE:
            continue

        span_headers: list[SpanHeader] = []
        article_mode = False
        for row in child.find_child(NodeKind.TABLE_ROW):
            column = 0
            row_label = ""
            article = ""
            for cell in row.find_child(any_cell):
                text = clean_node(wxr, None, cell)

                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if text == "":
                        continue
                    if text in ("Artikel", "Wortform"):
                        article_mode = True
                        continue
                    if "colspan" not in cell.attrs:
                        row_label = text
                        continue
                    width = int(cell.attrs.get("colspan"))
                    if width == 9:  # new table
                        article_mode = False
                        span_headers.clear()
                    span_headers.append(SpanHeader(text, column, width))
                    column += width
                    continue

                if cell.kind != NodeKind.TABLE_CELL:
                    continue

                if article_mode and column % 2 == 0:
                    # Remember the article for the form cell that follows.
                    article = text
                else:
                    prefix = "" if article in placeholders else article + " "
                    form = Form(form=prefix + text, source=page_tite)
                    if section_title != "":
                        form.raw_tags.append(section_title)
                    if row_label != "":
                        form.raw_tags.append(row_label)
                    form.raw_tags.extend(
                        header.text
                        for header in span_headers
                        if header.text not in placeholders
                        and header.index <= column < header.index + header.span
                    )
                    if form.form not in placeholders:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                # Every data cell, article or form, advances the column.
                column += int(cell.attrs.get("colspan", "1"))

109 

110 

def process_deutsch_verb_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
    page_tite: str,
) -> None:
    """Collect conjugated forms from an expanded "Deutsch Verb" template.

    The template is expanded and re-parsed, and every table found anywhere
    in the result is read row by row.  Cells are classified as follows:

    * a cell whose text is "Flexion der Verbaladjektive" or
      "(nichterweiterte) Infinitive" ends processing of its row;
    * a non-empty header cell other than "Person" is a column header;
    * data cells that are empty, "—", "Text", "Person" or start with
      "Flexion:" only advance the data column;
    * data cells that contain bold text, consist of a single ``<small>``
      child, or have the background colour ``#f4f4f4`` are headers: column
      headers when every non-empty data cell of the row is bold, otherwise
      the header of the current row;
    * every remaining data cell holds forms, one per line.

    A row made of a single cell starts a new sub-table and discards the
    column headers collected so far.

    Args:
        wxr: Extraction context used to expand, parse and clean wikitext.
        word_entry: Entry whose ``forms`` list receives the new forms.
        template_node: The "Deutsch Verb..." template to process.
        page_tite: Title stored as the ``source`` of every created form.
    """
    # Vorlage:Deutsch Verb regelmäßig
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in expanded_template.find_child_recursively(NodeKind.TABLE):
        col_headers: list[SpanHeader] = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            # The row's cells are needed three times below; walk the
            # children once instead of once per use.
            cells = list(row.find_child(cell_kinds))
            row_header = ""
            col_index = 0
            col_header_index = 0
            is_bold_col_header = all(
                c.contain_node(NodeKind.BOLD)
                for c in cells
                if c.kind == NodeKind.TABLE_CELL
                and clean_node(wxr, None, c) != ""
            )
            if len(cells) == 1:
                col_headers.clear()  # new table

            for cell in cells:
                cell_text = clean_node(wxr, None, cell)
                if cell_text in (
                    "Flexion der Verbaladjektive",
                    "(nichterweiterte) Infinitive",
                ):
                    break

                if (
                    cell.kind == NodeKind.TABLE_HEADER_CELL
                    and cell_text not in ("", "Person")
                ):
                    colspan = int(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        SpanHeader(cell_text, col_header_index, colspan)
                    )
                    col_header_index += colspan
                    continue

                if cell.kind != NodeKind.TABLE_CELL:
                    # Empty or "Person" header cell: nothing to record.
                    continue

                if cell_text in (
                    "",
                    "—",
                    "Text",
                    "Person",
                ) or cell_text.startswith("Flexion:"):
                    col_index += 1
                    continue

                is_header_in_cell = (
                    cell.contain_node(NodeKind.BOLD)
                    or (
                        any(True for _ in cell.find_html("small"))
                        and len(list(cell.filter_empty_str_child())) == 1
                    )
                    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
                    or cell.attrs.get("bgcolor", "").lower() == "#f4f4f4"
                )
                if is_header_in_cell:
                    colspan = int(cell.attrs.get("colspan", "1"))
                    if is_bold_col_header:
                        for bold_node in cell.find_child(NodeKind.BOLD):
                            col_headers.append(
                                SpanHeader(
                                    clean_node(wxr, None, bold_node),
                                    col_header_index,
                                    colspan,
                                )
                            )
                    else:
                        row_header = cell_text
                    col_header_index += colspan
                    continue

                _append_verb_forms(
                    word_entry,
                    cell_text,
                    row_header,
                    col_headers,
                    col_index,
                    page_tite,
                )
                # One data column per form cell, however many lines it has.
                col_index += 1


def _append_verb_forms(
    word_entry: WordEntry,
    cell_text: str,
    row_header: str,
    col_headers: "list[SpanHeader]",
    col_index: int,
    source: str,
) -> None:
    """Append one ``Form`` per usable line of a verb table data cell.

    Each line is stripped of surrounding commas and spaces.  A line of the
    shape "tag: form" is split at the first colon and the tag becomes a raw
    tag.  Lines that leave no form text, or only the "—" placeholder, are
    skipped so that no empty forms reach ``word_entry.forms``.
    """
    for line in cell_text.splitlines():
        form_text = line.strip(", ")
        form_raw_tag = ""
        if ":" in form_text:
            form_raw_tag, form_text = form_text.split(":", 1)
        form_text = form_text.strip()
        # Strip the tag as well so it can match the tag translation tables.
        form_raw_tag = form_raw_tag.strip()
        if form_text in ("", "—"):
            continue

        form = Form(form=form_text, source=source)
        if form_raw_tag != "":
            form.raw_tags.append(form_raw_tag)
        if row_header != "":
            form.raw_tags.append(row_header)
        for col_header in col_headers:
            if not (
                col_header.index
                <= col_index
                < col_header.index + col_header.span
            ):
                continue
            if col_header.text.endswith("I"):
                # Presumably a Roman-numeral header such as "Konjunktiv I";
                # kept as a single tag rather than split into words.
                form.raw_tags.append(col_header.text)
            else:
                form.raw_tags.extend(col_header.text.split())
        translate_raw_tags(form)
        word_entry.forms.append(form)