// MiniDNN
// Convolutional_DHT.h
1 #ifndef LAYER_CONVOLUTIONAL_H_
2 #define LAYER_CONVOLUTIONAL_H_
3 
4 #include <Eigen/Core>
5 #include <vector>
6 #include <stdexcept>
7 #include "../Config.h"
8 #include "../Layer.h"
9 #include "../Utils/Convolution.h"
10 #include "../Utils/Convolution_DHT.h"
11 #include "../Utils/Random.h"
12 
13 namespace MiniDNN {
14 
15 
23 template <typename Activation>
24 class Convolutional: public Layer
25 {
26 private:
27  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> Matrix;
28  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
29  typedef Matrix::ConstAlignedMapType ConstAlignedMapMat;
30  typedef Vector::ConstAlignedMapType ConstAlignedMapVec;
31  typedef Vector::AlignedMapType AlignedMapVec;
32 
33  const internal::ConvDims m_dim; // Various dimensions of convolution
34 
35  Vector m_filter_data; // Filter parameters. Total length is
36  // (in_channels x out_channels x filter_rows x filter_cols)
37  // See Utils/Convolution.h for its layout
38 
39  Vector m_df_data; // Derivative of filters, same dimension as m_filter_data
40 
41  Vector m_bias; // Bias term for the output channels, out_channels x 1. (One bias term per channel)
42  Vector m_db; // Derivative of bias, same dimension as m_bias
43 
44  Matrix m_z; // Linear term, z = conv(in, w) + b. Each column is an observation
45  Matrix m_a; // Output of this layer, a = act(z)
46  Matrix m_din; // Derivative of the input of this layer
47  // Note that input of this layer is also the output of previous layer
48  internal::DHT2D m_dht;
49  Matrix m_in_dht;
50  Matrix m_filter_dht;
51  Matrix m_dLz_dht;
52 
53 public:
64  Convolutional(const int in_width, const int in_height,
65  const int in_channels, const int out_channels,
66  const int window_width, const int window_height) :
67  Layer(in_width * in_height * in_channels,
68  (in_width - window_width + 1) * (in_height - window_height + 1) * out_channels),
69  m_dim(in_channels, out_channels, in_height, in_width, window_height, window_width),
70  m_dht(in_height, in_width)
71  {}
72 
73  void init(const Scalar& mu, const Scalar& sigma, RNG& rng)
74  {
75  // Set data dimension
76  const int filter_data_size = m_dim.in_channels * m_dim.out_channels * m_dim.filter_rows * m_dim.filter_cols;
77  m_filter_data.resize(filter_data_size);
78  m_df_data.resize(filter_data_size);
79 
80  // Random initialization of filter parameters
81  internal::set_normal_random(m_filter_data.data(), filter_data_size, rng, mu, sigma);
82 
83  // Bias term
84  m_bias.resize(m_dim.out_channels);
85  m_db.resize(m_dim.out_channels);
86  internal::set_normal_random(m_bias.data(), m_dim.out_channels, rng, mu, sigma);
87  }
88 
89  // http://cs231n.github.io/convolutional-networks/
90  void forward(const Matrix& prev_layer_data)
91  {
92  // Each column is an observation
93  const int nobs = prev_layer_data.cols();
94 
95  // Linear term, z = conv(in, w) + b
96  m_z.resize(this->m_out_size, nobs);
97  // Convolution
98  const int in_dht_depth = m_dim.in_channels * nobs;
99  m_dht.pseudo_transform(
100  ConstAlignedMapMat(prev_layer_data.data(), m_dim.channel_rows, m_dim.channel_cols * in_dht_depth),
101  in_dht_depth,
102  m_in_dht
103  );
104  const int filter_dht_depth = m_dim.in_channels * m_dim.out_channels;
105  m_dht.pseudo_transform_padding(
106  ConstAlignedMapMat(m_filter_data.data(), m_dim.filter_rows, m_dim.filter_cols * filter_dht_depth),
107  filter_dht_depth, true,
108  m_filter_dht
109  );
110  internal::convolve_valid_dht(m_dim, m_dht,
111  m_in_dht, m_filter_dht, true, nobs, m_z.data());
112  // Add bias terms
113  // Each column of m_z contains m_dim.out_channels channels, and each channel has
114  // m_dim.conv_rows * m_dim.conv_cols elements
115  int channel_start_row = 0;
116  const int channel_nelem = m_dim.conv_rows * m_dim.conv_cols;
117  for(int i = 0; i < m_dim.out_channels; i++, channel_start_row += channel_nelem)
118  {
119  m_z.block(channel_start_row, 0, channel_nelem, nobs).array() += m_bias[i];
120  }
121 
122  // Apply activation function
123  m_a.resize(this->m_out_size, nobs);
124  Activation::activate(m_z, m_a);
125  }
126 
127  const Matrix& output() const
128  {
129  return m_a;
130  }
131 
132  // prev_layer_data: in_size x nobs
133  // next_layer_data: out_size x nobs
134  // https://grzegorzgwardys.wordpress.com/2016/04/22/8/
135  void backprop(const Matrix& prev_layer_data, const Matrix& next_layer_data)
136  {
137  const int nobs = prev_layer_data.cols();
138 
139  // After forward stage, m_z contains z = conv(in, w) + b
140  // Now we need to calculate d(L) / d(z) = [d(a) / d(z)] * [d(L) / d(a)]
141  // d(L) / d(a) is computed in the next layer, contained in next_layer_data
142  // The Jacobian matrix J = d(a) / d(z) is determined by the activation function
143  Matrix& dLz = m_z;
144  Activation::apply_jacobian(m_z, m_a, next_layer_data, dLz);
145 
146  // z_j = sum_i(conv(in_i, w_ij)) + b_j
147  //
148  // d(z_k) / d(w_ij) = 0, if k != j
149  // d(L) / d(w_ij) = [d(z_j) / d(w_ij)] * [d(L) / d(z_j)] = sum_i{ [d(z_j) / d(w_ij)] * [d(L) / d(z_j)] }
150  // = sum_i(conv(in_i, d(L) / d(z_j)))
151  //
152  // z_j is an image (matrix), b_j is a scalar
153  // d(z_j) / d(b_j) = a matrix of the same size of d(z_j) filled with 1
154  // d(L) / d(b_j) = (d(L) / d(z_j)).sum()
155  //
156  // d(z_j) / d(in_i) = conv_full_op(w_ij_rotate)
157  // d(L) / d(in_i) = sum_j((d(z_j) / d(in_i)) * (d(L) / d(z_j))) = sum_j(conv_full(d(L) / d(z_j), w_ij_rotate))
158 
159  // Derivative for weights
160  internal::ConvDims back_conv_dim(nobs, m_dim.out_channels, m_dim.channel_rows, m_dim.channel_cols,
161  m_dim.conv_rows, m_dim.conv_cols);
162  const int dLz_dht_depth = back_conv_dim.in_channels * back_conv_dim.out_channels;
163  m_dht.pseudo_transform_padding(
164  ConstAlignedMapMat(dLz.data(), back_conv_dim.filter_rows, back_conv_dim.filter_cols * dLz_dht_depth),
165  dLz_dht_depth, true,
166  m_dLz_dht
167  );
168  internal::convolve_valid_dht(back_conv_dim, m_dht,
169  m_in_dht, m_dLz_dht, false, m_dim.in_channels, m_df_data.data());
170  m_df_data /= nobs;
171 
172  // Derivative for bias
173  // Aggregate d(L) / d(z) in each output channel
174  ConstAlignedMapMat dLz_by_channel(dLz.data(), m_dim.conv_rows * m_dim.conv_cols, m_dim.out_channels * nobs);
175  Vector dLb = dLz_by_channel.colwise().sum();
176  // Average over observations
177  ConstAlignedMapMat dLb_by_obs(dLb.data(), m_dim.out_channels, nobs);
178  m_db.noalias() = dLb_by_obs.rowwise().mean();
179 
180  // Compute d(L) / d_in = conv_full(d(L) / d(z), w_rotate)
181  m_din.resize(this->m_in_size, nobs);
182  internal::ConvDims conv_full_dim(m_dim.out_channels, m_dim.in_channels, m_dim.conv_rows, m_dim.conv_cols, m_dim.filter_rows, m_dim.filter_cols);
183  const int filter_dht_depth = m_dim.in_channels * m_dim.out_channels;
184  m_dht.pseudo_transform_padding(
185  ConstAlignedMapMat(m_filter_data.data(), m_dim.filter_rows, m_dim.filter_cols * filter_dht_depth),
186  filter_dht_depth, false,
187  m_filter_dht
188  );
189  internal::convolve_full_dht(conv_full_dim, m_dht,
190  m_dLz_dht, m_filter_dht, nobs, m_din.data());
191  }
192 
193  const Matrix& backprop_data() const
194  {
195  return m_din;
196  }
197 
198  void update(Optimizer& opt)
199  {
200  ConstAlignedMapVec dw(m_df_data.data(), m_df_data.size());
201  ConstAlignedMapVec db(m_db.data(), m_db.size());
202  AlignedMapVec w(m_filter_data.data(), m_filter_data.size());
203  AlignedMapVec b(m_bias.data(), m_bias.size());
204 
205  opt.update(dw, w);
206  opt.update(db, b);
207  }
208 
209  std::vector<Scalar> get_parameters() const
210  {
211  std::vector<Scalar> res(m_filter_data.size() + m_bias.size());
212  // Copy the data of filters and bias to a long vector
213  std::copy(m_filter_data.data(), m_filter_data.data() + m_filter_data.size(), res.begin());
214  std::copy(m_bias.data(), m_bias.data() + m_bias.size(), res.begin() + m_filter_data.size());
215 
216  return res;
217  }
218 
219  void set_parameters(const std::vector<Scalar>& param)
220  {
221  if(static_cast<int>(param.size()) != m_filter_data.size() + m_bias.size())
222  throw std::invalid_argument("Parameter size does not match");
223 
224  std::copy(param.begin(), param.begin() + m_filter_data.size(), m_filter_data.data());
225  std::copy(param.begin() + m_filter_data.size(), param.end(), m_bias.data());
226  }
227 
228  std::vector<Scalar> get_derivatives() const
229  {
230  std::vector<Scalar> res(m_df_data.size() + m_db.size());
231  // Copy the data of filters and bias to a long vector
232  std::copy(m_df_data.data(), m_df_data.data() + m_df_data.size(), res.begin());
233  std::copy(m_db.data(), m_db.data() + m_db.size(), res.begin() + m_df_data.size());
234 
235  return res;
236  }
237 };
238 
239 
240 } // namespace MiniDNN
241 
242 
243 #endif /* LAYER_CONVOLUTIONAL_H_ */
// ---------------------------------------------------------------------------
// Member index (documentation-extraction residue, kept for reference):
//   void init(const Scalar &mu, const Scalar &sigma, RNG &rng)
//   std::vector<Scalar> get_parameters() const
//   void set_parameters(const std::vector<Scalar> &param)
//   const Matrix &output() const
//   void update(Optimizer &opt)
//   Convolutional(const int in_width, const int in_height, const int in_channels, const int out_channels, const int window_width, const int window_height)
//   std::vector<Scalar> get_derivatives() const
//   void backprop(const Matrix &prev_layer_data, const Matrix &next_layer_data)
//   const Matrix &backprop_data() const
//   virtual void update(ConstAlignedMapVec &dvec, AlignedMapVec &vec) = 0
//   void forward(const Matrix &prev_layer_data)
// ---------------------------------------------------------------------------