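// NOTE: the next six lines are file-viewer residue captured with this header
// (they are not part of the dlib source) and are kept only as a comment:
//   AshanGimhana's picture
//   Upload folder using huggingface_hub
//   9375c9a verified
//   raw
//   history blame
//   12.4 kB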
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNn_SOLVERS_H_
#define DLIB_DNn_SOLVERS_H_
#include "solvers_abstract.h"
#include "../cuda/tensor.h"
#include <iostream>
#include "layers.h"
namespace dlib
{
class sgd
{
    // Stochastic gradient descent solver with momentum and weight decay.
    //
    // Every call to operator() refreshes the internal tensor v as
    //     v = momentum*v - wd*lr*params - lr*params_grad
    // where lr and wd are the learning rate and weight decay after being
    // scaled by the layer's multipliers, and then returns a reference to v.
    // The layer and its parameters are only read here; applying v to the
    // parameters is left to the caller.
public:

    // Builds a solver with the given weight decay and momentum coefficients.
    explicit sgd(
        float weight_decay_,
        float momentum_ = 0.9
    ) :
        weight_decay(weight_decay_),
        momentum(momentum_)
    {
    }

    // Default solver: weight decay 0.0005, momentum 0.9.
    sgd(
    ) : sgd(0.0005, 0.9)
    {
    }

    float get_momentum (
    ) const { return momentum; }

    float get_weight_decay (
    ) const { return weight_decay; }

    // Generic update used for any layer type that has no dedicated overload
    // below.  One learning rate and one weight decay value are applied to the
    // whole parameter tensor.
    //
    // Requires l.get_layer_params().size() != 0 (checked with DLIB_CASSERT).
    // Returns a reference to the internal state v, which is overwritten by the
    // next call.
    template <typename layer_type>
    const tensor& operator() (
        const float learning_rate,
        const layer_type& l,
        const tensor& params_grad
    )
    {
        const tensor& params = l.get_layer_params();
        DLIB_CASSERT(params.size() != 0);
        init_state_if_empty(params_grad);

        const double step  = learning_rate*get_learning_rate_multiplier(l);
        const double decay = weight_decay*get_weight_decay_multiplier(l);

        // v = momentum*v - decay*step*params - step*params_grad
        tt::affine_transform(v, v, params, params_grad, momentum, -decay*step, -step);
        return v;
    }

    // Fully connected layer with bias.  The last l.get_num_outputs() elements
    // of the parameter tensor are updated with the layer's bias multipliers.
    template <unsigned long N>
    const tensor& operator() (
        const float learning_rate,
        const fc_<N,FC_HAS_BIAS>& l,
        const tensor& params_grad
    )
    {
        const unsigned long bias_offset = params_grad.size()-l.get_num_outputs();
        update_considering_bias(learning_rate, l, params_grad, bias_offset);
        return v;
    }

    // Convolution layer.  The last l.num_filters() elements of the parameter
    // tensor are updated with the layer's bias multipliers.
    template <
        long NF,
        long NR,
        long NC,
        int SY,
        int SX,
        int PY,
        int PX
        >
    const tensor& operator() (
        const float learning_rate,
        const con_<NF,NR,NC,SY,SX,PY,PX>& l,
        const tensor& params_grad
    )
    {
        const unsigned long bias_offset = params_grad.size()-l.num_filters();
        update_considering_bias(learning_rate, l, params_grad, bias_offset);
        return v;
    }

    // Transposed convolution layer.  Handled exactly like con_ above.
    template <
        long NF,
        long NR,
        long NC,
        int SY,
        int SX,
        int PY,
        int PX
        >
    const tensor& operator() (
        const float learning_rate,
        const cont_<NF,NR,NC,SY,SX,PY,PX>& l,
        const tensor& params_grad
    )
    {
        const unsigned long bias_offset = params_grad.size()-l.num_filters();
        update_considering_bias(learning_rate, l, params_grad, bias_offset);
        return v;
    }

    // Batch normalization layer.  The second half of the parameter tensor is
    // updated with the layer's bias multipliers (presumably that half holds
    // the shift terms -- confirm against bn_).
    template < layer_mode mode >
    const tensor& operator() (
        const float learning_rate,
        const bn_<mode>& l,
        const tensor& params_grad
    )
    {
        const unsigned long bias_offset = params_grad.size()/2;
        update_considering_bias(learning_rate, l, params_grad, bias_offset);
        return v;
    }

    // Writes the version tag "sgd2" followed by v, weight_decay and momentum.
    friend void serialize(const sgd& item, std::ostream& out)
    {
        serialize("sgd2", out);
        serialize(item.v, out);
        serialize(item.weight_decay, out);
        serialize(item.momentum, out);
    }

    // Reads back what serialize() wrote.  Throws serialization_error when the
    // stream does not start with the "sgd2" version tag.
    friend void deserialize(sgd& item, std::istream& in)
    {
        std::string tag;
        deserialize(tag, in);
        if (tag != "sgd2")
        {
            throw serialization_error("Unexpected version found while deserializing dlib::sgd.");
        }

        deserialize(item.v, in);
        deserialize(item.weight_decay, in);
        deserialize(item.momentum, in);
    }

    // Prints a one line, human readable summary of the hyperparameters.
    friend std::ostream& operator<< (std::ostream& out, const sgd& item)
    {
        return out << "sgd: weight_decay=" << item.get_weight_decay()
                   << ", momentum=" << item.get_momentum();
    }

private:

    // On first use (v is still empty) give v the dimensions of params_grad and
    // fill it with zeros.  Does nothing on later calls.
    void init_state_if_empty(
        const tensor& params_grad
    )
    {
        if (v.size() != 0)
            return;

        v.copy_size(params_grad);
        v = 0;
    }

    // Same update as the generic operator(), except that the elements in
    // [bias_offset, v.size()) additionally have their learning rate and weight
    // decay scaled by the layer's bias multipliers.
    template <typename layer_type>
    void update_considering_bias(
        const float learning_rate,
        const layer_type& l,
        const tensor& params_grad,
        unsigned long bias_offset
    )
    {
        const tensor& params = l.get_layer_params();
        DLIB_CASSERT(params.size() != 0);
        init_state_if_empty(params_grad);

        const double step  = learning_rate*get_learning_rate_multiplier(l);
        const double decay = weight_decay*get_weight_decay_multiplier(l);

        // When both bias multipliers are 1 the biases are treated like every
        // other parameter, so one pass over the whole tensor is enough.
        const bool bias_same_as_weights = l.get_bias_learning_rate_multiplier() == 1 &&
                                          l.get_bias_weight_decay_multiplier() == 1;
        if (bias_same_as_weights)
        {
            tt::affine_transform(v, v, params, params_grad, momentum, -decay*step, -step);
            return;
        }

        // Everything in front of the biases uses the plain layer settings.
        tt::affine_transform_range(0, bias_offset, v, v, params, params_grad,
                                   momentum, -decay*step, -step);

        // The biases use the layer settings scaled by the bias multipliers.
        const double bias_step  = step*l.get_bias_learning_rate_multiplier();
        const double bias_decay = decay*l.get_bias_weight_decay_multiplier();
        tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad,
                                   momentum, -bias_decay*bias_step, -bias_step);
    }

    resizable_tensor v;   // momentum state; also the value returned by operator()
    float weight_decay;
    float momentum;
};
// ----------------------------------------------------------------------------------------
class adam
{
    // Adam style solver.
    //
    // The numerical work is delegated to tt::compute_adam_update(), which is
    // given the step buffer s, the running tensors m and v, the call counter
    // t, the effective learning rate and weight decay, the two momentum
    // coefficients, and the layer's parameters and gradient.  Each call to
    // operator() bumps t by one and returns a reference to s.  The layer and
    // its parameters are only read here.
public:

    // Builds a solver with the given weight decay and momentum coefficients.
    // The call counter starts at 0.
    adam(
        float weight_decay_,
        float momentum1_,
        float momentum2_
    ) :
        weight_decay(weight_decay_),
        momentum1(momentum1_),
        momentum2(momentum2_),
        t(0)
    {
    }

    // Default solver: weight decay 0.0005, momentum1 0.9, momentum2 0.999.
    adam(
    ) : adam(0.0005, 0.9, 0.999)
    {
    }

    float get_momentum1 (
    ) const { return momentum1; }

    float get_momentum2 (
    ) const { return momentum2; }

    float get_weight_decay (
    ) const { return weight_decay; }

    // Generic update used for any layer type that has no dedicated overload
    // below.  One learning rate and one weight decay value are applied to the
    // whole parameter tensor.
    //
    // Requires l.get_layer_params().size() != 0 (checked with DLIB_CASSERT).
    // Returns a reference to the internal buffer s, which is overwritten by
    // the next call.
    template <typename layer_type>
    const tensor& operator() (
        const float learning_rate,
        const layer_type& l,
        const tensor& params_grad
    )
    {
        const tensor& params = l.get_layer_params();
        DLIB_CASSERT(params.size() != 0);
        init_state_if_empty(params_grad);

        ++t;

        const double step  = learning_rate*get_learning_rate_multiplier(l);
        const double decay = weight_decay*get_weight_decay_multiplier(l);
        tt::compute_adam_update(0, params.size(), s, m, v, t,
                                step, decay,
                                momentum1, momentum2, params, params_grad);
        return s;
    }

    // Fully connected layer with bias.  The last l.get_num_outputs() elements
    // of the parameter tensor are updated with the layer's bias multipliers.
    template <unsigned long N>
    const tensor& operator() (
        const float learning_rate,
        const fc_<N,FC_HAS_BIAS>& l,
        const tensor& params_grad
    )
    {
        const unsigned long bias_offset = params_grad.size()-l.get_num_outputs();
        update_considering_bias(learning_rate, l, params_grad, bias_offset);
        return s;
    }

    // Convolution layer.  The last l.num_filters() elements of the parameter
    // tensor are updated with the layer's bias multipliers.
    template <
        long NF,
        long NR,
        long NC,
        int SY,
        int SX,
        int PY,
        int PX
        >
    const tensor& operator() (
        const float learning_rate,
        const con_<NF,NR,NC,SY,SX,PY,PX>& l,
        const tensor& params_grad
    )
    {
        const unsigned long bias_offset = params_grad.size()-l.num_filters();
        update_considering_bias(learning_rate, l, params_grad, bias_offset);
        return s;
    }

    // Transposed convolution layer.  Handled exactly like con_ above.
    template <
        long NF,
        long NR,
        long NC,
        int SY,
        int SX,
        int PY,
        int PX
        >
    const tensor& operator() (
        const float learning_rate,
        const cont_<NF,NR,NC,SY,SX,PY,PX>& l,
        const tensor& params_grad
    )
    {
        const unsigned long bias_offset = params_grad.size()-l.num_filters();
        update_considering_bias(learning_rate, l, params_grad, bias_offset);
        return s;
    }

    // Batch normalization layer.  The second half of the parameter tensor is
    // updated with the layer's bias multipliers (presumably that half holds
    // the shift terms -- confirm against bn_).
    template < layer_mode mode >
    const tensor& operator() (
        const float learning_rate,
        const bn_<mode>& l,
        const tensor& params_grad
    )
    {
        const unsigned long bias_offset = params_grad.size()/2;
        update_considering_bias(learning_rate, l, params_grad, bias_offset);
        return s;
    }

    // Writes the version tag "adam2" followed by m, v, s, weight_decay,
    // momentum1, momentum2 and t, in that order.
    friend void serialize(const adam& item, std::ostream& out)
    {
        serialize("adam2", out);
        serialize(item.m, out);
        serialize(item.v, out);
        serialize(item.s, out);
        serialize(item.weight_decay, out);
        serialize(item.momentum1, out);
        serialize(item.momentum2, out);
        serialize(item.t, out);
    }

    // Reads back what serialize() wrote.  Throws serialization_error when the
    // stream does not start with the "adam2" version tag.
    friend void deserialize(adam& item, std::istream& in)
    {
        std::string tag;
        deserialize(tag, in);
        if (tag != "adam2")
        {
            throw serialization_error("Unexpected version found while deserializing dlib::adam.");
        }

        deserialize(item.m, in);
        deserialize(item.v, in);
        deserialize(item.s, in);
        deserialize(item.weight_decay, in);
        deserialize(item.momentum1, in);
        deserialize(item.momentum2, in);
        deserialize(item.t, in);
    }

    // Prints a one line, human readable summary of the hyperparameters.
    friend std::ostream& operator<< (std::ostream& out, const adam& item)
    {
        return out << "adam: weight_decay=" << item.get_weight_decay()
                   << ", momentum1=" << item.get_momentum1()
                   << ", momentum2=" << item.get_momentum2();
    }

private:

    // On first use (v is still empty) give m, v and s the dimensions of
    // params_grad and zero m and v.  s is only sized, not cleared (presumably
    // tt::compute_adam_update() writes every element it is asked to cover).
    // Does nothing on later calls.
    void init_state_if_empty(
        const tensor& params_grad
    )
    {
        if (v.size() != 0)
            return;

        m.copy_size(params_grad);
        m = 0;
        v.copy_size(params_grad);
        v = 0;
        s.copy_size(params_grad);
    }

    // Same update as the generic operator(), except that the elements in
    // [bias_offset, params.size()) additionally have their learning rate and
    // weight decay scaled by the layer's bias multipliers.  t is advanced once
    // per call no matter how many ranges are processed.
    template <typename layer_type>
    void update_considering_bias(
        const float learning_rate,
        const layer_type& l,
        const tensor& params_grad,
        unsigned long bias_offset
    )
    {
        const tensor& params = l.get_layer_params();
        DLIB_CASSERT(params.size() != 0);
        init_state_if_empty(params_grad);

        ++t;

        const double step  = learning_rate*get_learning_rate_multiplier(l);
        const double decay = weight_decay*get_weight_decay_multiplier(l);

        // When both bias multipliers are 1 the biases are treated like every
        // other parameter, so one pass over the whole tensor is enough.
        const bool bias_same_as_weights = l.get_bias_learning_rate_multiplier() == 1 &&
                                          l.get_bias_weight_decay_multiplier() == 1;
        if (bias_same_as_weights)
        {
            tt::compute_adam_update(0, params.size(), s, m, v, t,
                                    step, decay,
                                    momentum1, momentum2, params, params_grad);
            return;
        }

        // Everything in front of the biases uses the plain layer settings.
        tt::compute_adam_update(0, bias_offset, s, m, v, t,
                                step, decay,
                                momentum1, momentum2, params, params_grad);

        // The biases use the layer settings scaled by the bias multipliers.
        tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
                                step*l.get_bias_learning_rate_multiplier(),
                                decay*l.get_bias_weight_decay_multiplier(),
                                momentum1, momentum2, params, params_grad);
    }

    resizable_tensor m;   // running state handed to tt::compute_adam_update()
    resizable_tensor v;   // running state handed to tt::compute_adam_update()
    resizable_tensor s;   // output buffer; the value returned by operator()
    float weight_decay;
    float momentum1;
    float momentum2;
    float t;              // number of updates performed so far
};
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_DNn_SOLVERS_H_