// MiniDNN
// Convolutional.h
1 #ifndef LAYER_CONVOLUTIONAL_H_
2 #define LAYER_CONVOLUTIONAL_H_
3 
#include <Eigen/Core>
#include <algorithm>
#include <stdexcept>
#include <vector>
#include "../Config.h"
#include "../Layer.h"
#include "../Utils/Convolution.h"
#include "../Utils/Random.h"
11 
12 namespace MiniDNN {
13 
14 
22 template <typename Activation>
23 class Convolutional: public Layer
24 {
25 private:
26  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> Matrix;
27  typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
28  typedef Matrix::ConstAlignedMapType ConstAlignedMapMat;
29  typedef Vector::ConstAlignedMapType ConstAlignedMapVec;
30  typedef Vector::AlignedMapType AlignedMapVec;
31 
32  const internal::ConvDims m_dim; // Various dimensions of convolution
33 
34  Vector m_filter_data; // Filter parameters. Total length is
35  // (in_channels x out_channels x filter_rows x filter_cols)
36  // See Utils/Convolution.h for its layout
37 
38  Vector m_df_data; // Derivative of filters, same dimension as m_filter_data
39 
40  Vector m_bias; // Bias term for the output channels, out_channels x 1. (One bias term per channel)
41  Vector m_db; // Derivative of bias, same dimension as m_bias
42 
43  Matrix m_z; // Linear term, z = conv(in, w) + b. Each column is an observation
44  Matrix m_a; // Output of this layer, a = act(z)
45  Matrix m_din; // Derivative of the input of this layer
46  // Note that input of this layer is also the output of previous layer
47 
48 public:
59  Convolutional(const int in_width, const int in_height,
60  const int in_channels, const int out_channels,
61  const int window_width, const int window_height) :
62  Layer(in_width * in_height * in_channels,
63  (in_width - window_width + 1) * (in_height - window_height + 1) * out_channels),
64  m_dim(in_channels, out_channels, in_height, in_width, window_height, window_width)
65  {}
66 
67  void init(const Scalar& mu, const Scalar& sigma, RNG& rng)
68  {
69  // Set data dimension
70  const int filter_data_size = m_dim.in_channels * m_dim.out_channels * m_dim.filter_rows * m_dim.filter_cols;
71  m_filter_data.resize(filter_data_size);
72  m_df_data.resize(filter_data_size);
73 
74  // Random initialization of filter parameters
75  internal::set_normal_random(m_filter_data.data(), filter_data_size, rng, mu, sigma);
76 
77  // Bias term
78  m_bias.resize(m_dim.out_channels);
79  m_db.resize(m_dim.out_channels);
80  internal::set_normal_random(m_bias.data(), m_dim.out_channels, rng, mu, sigma);
81  }
82 
83  // http://cs231n.github.io/convolutional-networks/
84  void forward(const Matrix& prev_layer_data)
85  {
86  // Each column is an observation
87  const int nobs = prev_layer_data.cols();
88 
89  // Linear term, z = conv(in, w) + b
90  m_z.resize(this->m_out_size, nobs);
91  // Convolution
92  internal::convolve_valid(m_dim, prev_layer_data.data(), true, nobs,
93  m_filter_data.data(), m_z.data()
94  );
95  // Add bias terms
96  // Each column of m_z contains m_dim.out_channels channels, and each channel has
97  // m_dim.conv_rows * m_dim.conv_cols elements
98  int channel_start_row = 0;
99  const int channel_nelem = m_dim.conv_rows * m_dim.conv_cols;
100  for(int i = 0; i < m_dim.out_channels; i++, channel_start_row += channel_nelem)
101  {
102  m_z.block(channel_start_row, 0, channel_nelem, nobs).array() += m_bias[i];
103  }
104 
105  // Apply activation function
106  m_a.resize(this->m_out_size, nobs);
107  Activation::activate(m_z, m_a);
108  }
109 
110  const Matrix& output() const
111  {
112  return m_a;
113  }
114 
115  // prev_layer_data: in_size x nobs
116  // next_layer_data: out_size x nobs
117  // https://grzegorzgwardys.wordpress.com/2016/04/22/8/
118  void backprop(const Matrix& prev_layer_data, const Matrix& next_layer_data)
119  {
120  const int nobs = prev_layer_data.cols();
121 
122  // After forward stage, m_z contains z = conv(in, w) + b
123  // Now we need to calculate d(L) / d(z) = [d(a) / d(z)] * [d(L) / d(a)]
124  // d(L) / d(a) is computed in the next layer, contained in next_layer_data
125  // The Jacobian matrix J = d(a) / d(z) is determined by the activation function
126  Matrix& dLz = m_z;
127  Activation::apply_jacobian(m_z, m_a, next_layer_data, dLz);
128 
129  // z_j = sum_i(conv(in_i, w_ij)) + b_j
130  //
131  // d(z_k) / d(w_ij) = 0, if k != j
132  // d(L) / d(w_ij) = [d(z_j) / d(w_ij)] * [d(L) / d(z_j)] = sum_i{ [d(z_j) / d(w_ij)] * [d(L) / d(z_j)] }
133  // = sum_i(conv(in_i, d(L) / d(z_j)))
134  //
135  // z_j is an image (matrix), b_j is a scalar
136  // d(z_j) / d(b_j) = a matrix of the same size of d(z_j) filled with 1
137  // d(L) / d(b_j) = (d(L) / d(z_j)).sum()
138  //
139  // d(z_j) / d(in_i) = conv_full_op(w_ij_rotate)
140  // d(L) / d(in_i) = sum_j((d(z_j) / d(in_i)) * (d(L) / d(z_j))) = sum_j(conv_full(d(L) / d(z_j), w_ij_rotate))
141 
142  // Derivative for weights
143  internal::ConvDims back_conv_dim(nobs, m_dim.out_channels, m_dim.channel_rows, m_dim.channel_cols,
144  m_dim.conv_rows, m_dim.conv_cols);
145  internal::convolve_valid(back_conv_dim, prev_layer_data.data(), false, m_dim.in_channels,
146  dLz.data(), m_df_data.data()
147  );
148  m_df_data /= nobs;
149 
150  // Derivative for bias
151  // Aggregate d(L) / d(z) in each output channel
152  ConstAlignedMapMat dLz_by_channel(dLz.data(), m_dim.conv_rows * m_dim.conv_cols, m_dim.out_channels * nobs);
153  Vector dLb = dLz_by_channel.colwise().sum();
154  // Average over observations
155  ConstAlignedMapMat dLb_by_obs(dLb.data(), m_dim.out_channels, nobs);
156  m_db.noalias() = dLb_by_obs.rowwise().mean();
157 
158  // Compute d(L) / d_in = conv_full(d(L) / d(z), w_rotate)
159  m_din.resize(this->m_in_size, nobs);
160  internal::ConvDims conv_full_dim(m_dim.out_channels, m_dim.in_channels, m_dim.conv_rows, m_dim.conv_cols, m_dim.filter_rows, m_dim.filter_cols);
161  internal::convolve_full(conv_full_dim, dLz.data(), nobs,
162  m_filter_data.data(), m_din.data()
163  );
164  }
165 
166  const Matrix& backprop_data() const
167  {
168  return m_din;
169  }
170 
171  void update(Optimizer& opt)
172  {
173  ConstAlignedMapVec dw(m_df_data.data(), m_df_data.size());
174  ConstAlignedMapVec db(m_db.data(), m_db.size());
175  AlignedMapVec w(m_filter_data.data(), m_filter_data.size());
176  AlignedMapVec b(m_bias.data(), m_bias.size());
177 
178  opt.update(dw, w);
179  opt.update(db, b);
180  }
181 
182  std::vector<Scalar> get_parameters() const
183  {
184  std::vector<Scalar> res(m_filter_data.size() + m_bias.size());
185  // Copy the data of filters and bias to a long vector
186  std::copy(m_filter_data.data(), m_filter_data.data() + m_filter_data.size(), res.begin());
187  std::copy(m_bias.data(), m_bias.data() + m_bias.size(), res.begin() + m_filter_data.size());
188 
189  return res;
190  }
191 
192  void set_parameters(const std::vector<Scalar>& param)
193  {
194  if(static_cast<int>(param.size()) != m_filter_data.size() + m_bias.size())
195  throw std::invalid_argument("Parameter size does not match");
196 
197  std::copy(param.begin(), param.begin() + m_filter_data.size(), m_filter_data.data());
198  std::copy(param.begin() + m_filter_data.size(), param.end(), m_bias.data());
199  }
200 
201  std::vector<Scalar> get_derivatives() const
202  {
203  std::vector<Scalar> res(m_df_data.size() + m_db.size());
204  // Copy the data of filters and bias to a long vector
205  std::copy(m_df_data.data(), m_df_data.data() + m_df_data.size(), res.begin());
206  std::copy(m_db.data(), m_db.data() + m_db.size(), res.begin() + m_df_data.size());
207 
208  return res;
209  }
210 };
211 
212 
213 } // namespace MiniDNN
214 
215 
216 #endif /* LAYER_CONVOLUTIONAL_H_ */
/* Doxygen cross-reference residue from the documentation extraction — not
   part of the original source. Preserved for reference:

void init(const Scalar &mu, const Scalar &sigma, RNG &rng)
Definition: Convolutional.h:67
std::vector< Scalar > get_parameters() const
void set_parameters(const std::vector< Scalar > &param)
const Matrix & output() const
void update(Optimizer &opt)
Convolutional(const int in_width, const int in_height, const int in_channels, const int out_channels, const int window_width, const int window_height)
Definition: Convolutional.h:59
std::vector< Scalar > get_derivatives() const
void backprop(const Matrix &prev_layer_data, const Matrix &next_layer_data)
const Matrix & backprop_data() const
virtual void update(ConstAlignedMapVec &dvec, AlignedMapVec &vec)=0
void forward(const Matrix &prev_layer_data)
Definition: Convolutional.h:84
*/