// Copyright (C) 2015 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. #ifndef DLIB_DNn_TRAINER_H_ #define DLIB_DNn_TRAINER_H_ #include "trainer_abstract.h" #include "core.h" #include "solvers.h" #include "../statistics.h" #include #include #include #include "../serialize.h" #include "../pipe.h" #include "../threads.h" #include "../cuda/cuda_dlib.h" #include "../statistics/running_gradient.h" #include #include #include #include #include #include #include "../dir_nav.h" #include "../md5.h" namespace dlib { // ---------------------------------------------------------------------------------------- namespace impl { template struct dnn_job_t { dnn_job_t() = default; dnn_job_t(const dnn_job_t&) = delete; dnn_job_t& operator=(const dnn_job_t&) = delete; std::vector> labels; std::vector t; std::vector have_data; // have_data[i] is true if there is data in labels[i] and t[i]. bool test_only = false; }; template void swap(dnn_job_t& a, dnn_job_t& b) { a.labels.swap(b.labels); a.t.swap(b.t); a.have_data.swap(b.have_data); std::swap(a.test_only,b.test_only); } } enum class force_flush_to_disk { no = 0, yes = 1 }; template < typename net_type, typename solver_type = sgd > class dnn_trainer : private threaded_object { public: static_assert(is_loss_layer_type::value, "The last layer in a network must be a loss layer."); typedef typename net_type::training_label_type training_label_type; typedef typename net_type::input_type input_type; const static size_t num_computational_layers = net_type::num_computational_layers; const static size_t num_layers = net_type::num_layers; using threads = std::vector>; private: typedef impl::dnn_job_t job_t; public: dnn_trainer() = delete; dnn_trainer(const dnn_trainer&) = delete; dnn_trainer& operator=(const dnn_trainer&) = delete; explicit dnn_trainer(net_type& net_) : job_pipe(0), net(net_) { solver_type default_solver; devices.push_back(std::make_shared(dlib::cuda::get_device(), net, default_solver)); init(); } dnn_trainer( net_type& net_, const solver_type& solver_ ) : job_pipe(0), net(net_) { devices.push_back(std::make_shared(dlib::cuda::get_device(), net, solver_)); init(); } dnn_trainer( net_type& net_, const solver_type& solver_, const std::vector& cuda_extra_devices, std::shared_ptr thread_pools_ = std::shared_ptr() ) : job_pipe(0), thread_pools(thread_pools_), net(net_) { devices.push_back(std::make_shared(dlib::cuda::get_device(), net, solver_)); const int total_devices = dlib::cuda::get_num_devices(); // Make device contexts for the extra device ids but be careful to avoid any // duplicate ids. std::set temp(cuda_extra_devices.begin(), cuda_extra_devices.end()); temp.erase(devices[0]->device_id); for (auto id : temp) { DLIB_CASSERT(0 <= id && id < total_devices, "Invalid CUDA device id given to dnn_trainer."); // Switch to this device so that any tensor objects that get allocated when // we create the device context happen on this device. dlib::cuda::set_device(id); devices.push_back(std::make_shared(id, net, solver_, clone_net())); } // Set the current device back to what it was before this constructor was // called. dlib::cuda::set_device(devices[0]->device_id); init(); } ~dnn_trainer( ) { job_pipe.disable(); stop(); wait(); } net_type& get_net ( force_flush_to_disk force_flush = force_flush_to_disk::yes ) { wait_for_thread_to_pause(); sync_to_disk(force_flush == force_flush_to_disk::yes); propagate_exception(); return net; } unsigned long get_mini_batch_size ( ) const { return mini_batch_size; } void set_mini_batch_size ( unsigned long batch_size ) { DLIB_CASSERT(batch_size > 0); mini_batch_size = batch_size; } unsigned long get_max_num_epochs ( ) const { return max_num_epochs; } void set_max_num_epochs ( unsigned long num ) { DLIB_CASSERT(num > 0); max_num_epochs = num; } void be_verbose ( ) { verbose = true; } void be_quiet ( ) { verbose = false; } const std::vector& get_solvers ( ) const { wait_for_thread_to_pause(); propagate_exception(); return devices[0]->solvers; } void train_one_step ( const std::vector& data, const std::vector& labels ) { DLIB_CASSERT(data.size() == labels.size()); train_one_step(data.begin(), data.end(), labels.begin()); } template < typename data_iterator, typename label_iterator > void train_one_step ( data_iterator dbegin, data_iterator dend, label_iterator lbegin ) { DLIB_CASSERT(std::distance(dbegin, dend) > 0); print_periodic_verbose_status(); sync_to_disk(); send_job(false, dbegin, dend, lbegin); ++train_one_step_calls; } void train_one_step ( const std::vector& data ) { train_one_step(data.begin(), data.end()); } template < typename data_iterator > void train_one_step ( data_iterator dbegin, data_iterator dend ) { DLIB_CASSERT(std::distance(dbegin, dend) > 0); print_periodic_verbose_status(); sync_to_disk(); send_job(false, dbegin, dend); ++train_one_step_calls; } void test_one_step ( const std::vector& data, const std::vector& labels ) { DLIB_CASSERT(data.size() == labels.size()); test_one_step(data.begin(), data.end(), labels.begin()); } template < typename data_iterator, typename label_iterator > void test_one_step ( data_iterator dbegin, data_iterator dend, label_iterator lbegin ) { DLIB_CASSERT(std::distance(dbegin, dend) > 0); print_periodic_verbose_status(); sync_to_disk(); send_job(true, dbegin, dend, lbegin); ++test_one_step_calls; } void test_one_step ( const std::vector& data ) { test_one_step(data.begin(), data.end()); } template < typename data_iterator > void test_one_step ( data_iterator dbegin, data_iterator dend ) { DLIB_CASSERT(std::distance(dbegin, dend) > 0); print_periodic_verbose_status(); sync_to_disk(); send_job(true, dbegin, dend); ++test_one_step_calls; } void train ( const std::vector& data, const std::vector& labels ) { DLIB_CASSERT(data.size() == labels.size() && data.size() > 0); // The reason these two loops don't initialize their counter variables but // instead use class members is so we can include the state of the loops in the // stuff written by sync_to_disk() for (; epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate; ++epoch_iteration) { using namespace std::chrono; last_time = system_clock::now(); clear_average_loss(); for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size) { if (verbose) { auto now_time = system_clock::now(); if (now_time-last_time > seconds(20)) { last_time = now_time; auto iter = epoch_iteration + epoch_pos/(double)data.size(); std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " " << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; print_progress(); } } sync_to_disk(); send_job(false, data.begin()+epoch_pos, data.begin()+std::min(epoch_pos+mini_batch_size,data.size()), labels.begin()+epoch_pos); } epoch_pos = 0; if (verbose) { // Capitalize the E in Epoch so it's easy to grep out the lines that // are for full epoch status statements. std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " " << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; print_progress(); } } wait_for_thread_to_pause(); // if we modified the network at all then be sure to sync the final result. sync_to_disk(true); } void train ( const std::vector& data ) { DLIB_CASSERT(data.size() > 0); const bool has_unsupervised_loss = std::is_same::value; static_assert(has_unsupervised_loss, "You can only call this version of train() when using an unsupervised loss."); // The reason these two loops don't initialize their counter variables but // instead use class members is so we can include the state of the loops in the // stuff written by sync_to_disk() for (; epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate; ++epoch_iteration) { using namespace std::chrono; last_time = system_clock::now(); clear_average_loss(); for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size) { if (verbose) { auto now_time = system_clock::now(); if (now_time-last_time > seconds(20)) { last_time = now_time; auto iter = epoch_iteration + epoch_pos/(double)data.size(); std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " " << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; print_progress(); } } sync_to_disk(); send_job(false, data.begin()+epoch_pos, data.begin()+std::min(epoch_pos+mini_batch_size,data.size())); } epoch_pos = 0; if (verbose) { // Capitalize the E in Epoch so it's easy to grep out the lines that // are for full epoch status statements. std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " " << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; print_progress(); } } wait_for_thread_to_pause(); // if we modified the network at all then be sure to sync the final result. sync_to_disk(true); } void set_synchronization_file ( const std::string& filename, std::chrono::seconds time_between_syncs_ = std::chrono::minutes(15) ) { last_sync_time = std::chrono::system_clock::now(); sync_filename = filename; time_between_syncs = time_between_syncs_; // check if the sync file already exists, if it does we should load it. std::ifstream fin(newest_syncfile(), std::ios::binary); if (fin) deserialize(*this, fin); } const std::string& get_synchronization_file ( ) { return sync_filename; } double get_average_loss ( ) const { wait_for_thread_to_pause(); return rs.mean(); } double get_average_test_loss ( ) const { wait_for_thread_to_pause(); return rs_test.mean(); } void clear_average_loss ( ) { wait_for_thread_to_pause(); rs.clear(); } void set_learning_rate ( double lr ) { DLIB_CASSERT(lr > 0); wait_for_thread_to_pause(); if (learning_rate != lr) { steps_without_progress = 0; test_steps_without_progress = 0; previous_loss_values.clear(); test_previous_loss_values.clear(); previous_loss_values_to_keep_until_disk_sync.clear(); } learning_rate = lr; lr_schedule.set_size(0); } double get_learning_rate( ) const { return learning_rate; } void set_min_learning_rate ( double lr ) { DLIB_CASSERT(lr > 0); wait_for_thread_to_pause(); lr_schedule.set_size(0); min_learning_rate = lr; } double get_min_learning_rate ( ) const { return min_learning_rate; } template void set_learning_rate_schedule ( const matrix_exp& schedule ) { DLIB_CASSERT(schedule.size() > 0); DLIB_CASSERT(min(schedule) > 0); set_learning_rate(schedule(0,0)); set_min_learning_rate(min(schedule)); set_learning_rate_shrink_factor(1); lr_schedule = matrix_cast(reshape_to_column_vector(schedule)); lr_schedule_pos = 0; } const matrix& get_learning_rate_schedule ( ) const { return lr_schedule; } void set_iterations_without_progress_threshold ( unsigned long thresh ) { wait_for_thread_to_pause(); lr_schedule.set_size(0); iter_without_progress_thresh = thresh; } unsigned long get_iterations_without_progress_threshold ( ) const { return iter_without_progress_thresh; } unsigned long get_steps_without_progress ( ) const { return steps_without_progress; } void set_test_iterations_without_progress_threshold ( unsigned long thresh ) { wait_for_thread_to_pause(); lr_schedule.set_size(0); test_iter_without_progress_thresh = thresh; } unsigned long get_test_iterations_without_progress_threshold ( ) const { return test_iter_without_progress_thresh; } unsigned long get_test_steps_without_progress ( ) const { return test_steps_without_progress; } void set_learning_rate_shrink_factor ( double shrink ) { DLIB_CASSERT(0 < shrink && shrink <= 1); wait_for_thread_to_pause(); lr_schedule.set_size(0); learning_rate_shrink = shrink; steps_without_progress = 0; test_steps_without_progress = 0; } double get_learning_rate_shrink_factor ( ) const { return learning_rate_shrink; } unsigned long long get_train_one_step_calls ( ) const { return train_one_step_calls; } unsigned long long get_test_one_step_calls ( ) const { return test_one_step_calls; } private: void record_test_loss(double loss) { test_previous_loss_values.push_back(loss); if (is_finite(loss)) rs_test.add(loss); // discard really old loss values. while (test_previous_loss_values.size() > test_iter_without_progress_thresh) test_previous_loss_values.pop_front(); } void record_loss(double loss) { // This kind of budgeting causes our gradient checking to use a fixed amount of // computational resources, regardless of the size of iter_without_progress_thresh. gradient_check_budget += 200; rs.add(loss); previous_loss_values.push_back(loss); // discard really old loss values. while (previous_loss_values.size() > iter_without_progress_thresh) previous_loss_values.pop_front(); // separately keep another loss history until disk sync // (but only if disk sync is enabled) if (!sync_filename.empty()) previous_loss_values_to_keep_until_disk_sync.push_back(loss); } template double compute_parameter_gradients(size_t device, job_t& next_job, const T&) { if (next_job.have_data[device]) { auto&& dev = *devices[device]; dlib::cuda::set_device(dev.device_id); if (next_job.test_only) return dev.net.compute_loss(next_job.t[device], next_job.labels[device].begin()); else return dev.net.compute_parameter_gradients(next_job.t[device], next_job.labels[device].begin()); } else { return 0; } } double compute_parameter_gradients(size_t device, job_t& next_job, const no_label_type&) { if (next_job.have_data[device]) { auto&& dev = *devices[device]; dlib::cuda::set_device(dev.device_id); no_label_type pick_which_run_update; if (next_job.test_only) return dev.net.compute_loss(next_job.t[device]); else return dev.net.compute_parameter_gradients(next_job.t[device]); } else { return 0; } } void update_parameters(size_t device) { auto&& dev = *devices[device]; dlib::cuda::set_device(dev.device_id); dev.net.update_parameters(make_sstack(dev.solvers), learning_rate); } void thread() try { training_label_type pick_which_run_update; job_t next_job; std::vector> losses(devices.size()); std::vector averagers; // An array of all the parameter tensors in the first network. We will // periodically copy these tensors to all the other devices to make sure the // different GPUs don't go out of sync. std::vector reference_params; visit_layer_parameters(devices[0]->net, [&](tensor& t) { reference_params.push_back(&t); }); // If no external thread pools vector was passed, then create one that will // be automatically destructed as soon as the dnn_trainer object goes out of // scope. if (!thread_pools) thread_pools = std::make_shared(); auto& tp = *thread_pools; // We make separate thread pools with just one thread in them because we want // to make sure each device is always executed on the same thread. We care // about this because there are thread_local context variables for some cuda // components and they get allocated for each combination of thread and device. // So if we make sure the same device always uses the same thread this will // reduce the number of contexts we allocate from num_devices*num_devices to // just num_devices. while (tp.size() < devices.size()) tp.push_back(std::make_shared(1)); main_iteration_counter = 0; while(job_pipe.dequeue(next_job)) { if (next_job.test_only) { // compute the testing loss for (size_t i = 0; i < devices.size(); ++i) tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]); // aggregate loss values from all the network computations. double theloss = 0; for (auto&& loss : losses) theloss += loss.get(); record_test_loss(theloss/losses.size()); // Check if we should shrink the learning rate based on how the test // error has been doing lately. if (learning_rate_shrink != 1) { test_steps_without_progress = count_steps_without_decrease(test_previous_loss_values); if (test_steps_without_progress >= test_iter_without_progress_thresh) { test_steps_without_progress = count_steps_without_decrease_robust(test_previous_loss_values); if (test_steps_without_progress >= test_iter_without_progress_thresh) { // optimization has flattened out, so drop the learning rate. learning_rate = learning_rate_shrink*learning_rate; test_steps_without_progress = 0; // Empty out some of the previous loss values so that test_steps_without_progress // will decrease below test_iter_without_progress_thresh. drop_some_test_previous_loss_values(); } } } continue; } updated_net_since_last_sync = true; ++main_iteration_counter; // Call compute_parameter_gradients() and update_parameters() but pick the // right version for unsupervised or supervised training based on the type // of training_label_type. for (size_t i = 0; i < devices.size(); ++i) tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]); // aggregate loss values from all the network computations. double theloss = 0; for (auto&& loss : losses) theloss += loss.get(); record_loss(theloss/losses.size()); // Now, if there is more than one active device we need to synchronize the // gradient updates between devices. So we do that now. if (devices.size() > 1) { // if this is the first iteration then we need to setup the averagers. // We can't do this outside the loop because the tensors that get // averaged need to be allocated to their devices before we call set() // so that the averagers can determine how best to average them. if (averagers.size() == 0 || sync_file_reloaded) { averagers = std::vector(net_type::num_computational_layers); // setup the averagers to point to the tensors in the networks. std::vector> all_tensors(devices.size()); for (size_t i = 0; i < all_tensors.size(); ++i) { all_tensors[i].resize(net_type::num_computational_layers); visit_layer_parameter_gradients(devices[i]->net, [&](size_t j, tensor& t){ all_tensors[i][j] = &t; }); } // Now set each averager to average the tensors at the same layer in each // network. for (size_t i = 0; i < net_type::num_computational_layers; ++i) { std::vector temp(all_tensors.size()); for (size_t j = 0; j < all_tensors.size(); ++j) { temp[j] = all_tensors[j][i]; DLIB_CASSERT(temp[0]->size() == temp[j]->size(), "Make sure you don't modify the network structure " "or number of parameters after constructing the trainer."); } // ignore layers that don't have parameters if (temp[0]->size() != 0) averagers[i].set(temp); } sync_file_reloaded = false; } for (auto&& d : devices) cuda::device_synchronize(d->device_id); for (auto&& avg : averagers) avg.average(); } // Now apply all the updates to each device. for (size_t i = 0; i < devices.size(); ++i) tp[i]->add_task_by_value([&,i](){ if (next_job.have_data[i]) update_parameters(i); }); // and wait for the updates to all happen. for (size_t i = 0; i < devices.size(); ++i) tp[i]->wait_for_all_tasks(); // Every now and then force all the parameters to be the same just to make // sure they aren't drifting apart due to any non-deterministic behavior on // the GPU. It's also important to do this on the first iteration because // the different networks may be initialized differently when tensor data // is first passed through them. So this code block deals with these // issues. if (devices.size() > 1 && main_iteration_counter%2000 == 1) { for (size_t i = 1; i < devices.size(); ++i) { visit_layer_parameters(devices[i]->net, [&](size_t j, tensor& t) { memcpy(t, *reference_params[j]); }); } } // If we have been running for a while then check if the loss is still // dropping. If it isn't then we will reduce the learning rate. Note that we // have a "budget" that prevents us from calling // count_steps_without_decrease() every iteration. We do this because // it can be expensive to compute when previous_loss_values is large. if (gradient_check_budget > iter_without_progress_thresh && learning_rate_shrink != 1) { gradient_check_budget = 0; steps_without_progress = count_steps_without_decrease(previous_loss_values); if (steps_without_progress >= iter_without_progress_thresh) { // Double check that we aren't seeing decrease. This second check // discards the top 10% largest values and checks again. We do // this because sometimes a mini-batch might be bad and cause the // loss to suddenly jump up, making count_steps_without_decrease() // return a large number. But if we discard the top 10% of the // values in previous_loss_values then we are robust to that kind // of noise. Another way of looking at it, if the reason // count_steps_without_decrease() returns a large value is only // because the most recent loss values have suddenly been large, // then we shouldn't stop or lower the learning rate. We should // keep going until whatever disturbance we hit is damped down. steps_without_progress = count_steps_without_decrease_robust(previous_loss_values); if (steps_without_progress >= iter_without_progress_thresh) { // optimization has flattened out, so drop the learning rate. learning_rate = learning_rate_shrink*learning_rate; steps_without_progress = 0; // Empty out some of the previous loss values so that steps_without_progress // will decrease below iter_without_progress_thresh. drop_some_previous_loss_values(); } } } else if (lr_schedule.size() != 0) // or use the learning rate schedule if we have one. { if (lr_schedule_pos < lr_schedule.size()) learning_rate = lr_schedule(lr_schedule_pos++); else learning_rate = lr_schedule(lr_schedule.size()-1)*0.99; } } } catch(...) { // If an exception happens then permanently disable the trainer object. job_pipe.disable(); std::lock_guard lock(eptr_mutex); eptr = std::current_exception(); } void wait_for_thread_to_pause() const { job_pipe.wait_for_num_blocked_dequeues(1); } const static long string_pad = 11; const static long epoch_string_pad = 4; const static long lr_string_pad = 4; void init() { max_num_epochs = 10000; mini_batch_size = 128; verbose = false; learning_rate = 1e-2; min_learning_rate = 1e-5; iter_without_progress_thresh = 2000; steps_without_progress = 0; test_iter_without_progress_thresh = 500; test_steps_without_progress = 0; learning_rate_shrink = 0.1; epoch_iteration = 0; epoch_pos = 0; train_one_step_calls = 0; test_one_step_calls = 0; gradient_check_budget = 0; lr_schedule_pos = 0; main_iteration_counter = 0; main_iteration_counter_at_last_disk_sync = 0; prob_loss_increasing_thresh_default_value = 0.99; prob_loss_increasing_thresh_max_value = 0.99999; prob_loss_increasing_thresh = prob_loss_increasing_thresh_default_value; updated_net_since_last_sync = false; sync_file_reloaded = false; previous_loss_values_dump_amount = 400; test_previous_loss_values_dump_amount = 100; rs_test = running_stats_decayed(200); start(); } // serialize and deserialize are private because we hold net by reference so // allowing someone to serialize this training object is weird and will likely // result in user errors. However, we use these functions as part of the automatic // sync code in this object. friend void serialize(const dnn_trainer& item, std::ostream& out) { item.wait_for_thread_to_pause(); int version = 13; serialize(version, out); size_t nl = dnn_trainer::num_layers; serialize(nl, out); serialize(item.rs, out); serialize(item.rs_test, out); serialize(item.previous_loss_values, out); serialize(item.max_num_epochs, out); serialize(item.mini_batch_size, out); serialize(item.verbose, out); serialize(item.net, out); serialize(item.devices[0]->solvers, out); serialize(item.learning_rate.load(), out); serialize(item.min_learning_rate, out); serialize(item.iter_without_progress_thresh.load(), out); serialize(item.steps_without_progress.load(), out); serialize(item.learning_rate_shrink.load(), out); serialize(item.epoch_iteration, out); serialize(item.epoch_pos, out); serialize(item.train_one_step_calls, out); serialize(item.test_one_step_calls, out); serialize(item.lr_schedule, out); serialize(item.lr_schedule_pos, out); serialize(item.test_iter_without_progress_thresh.load(), out); serialize(item.test_steps_without_progress.load(), out); serialize(item.test_previous_loss_values, out); serialize(item.previous_loss_values_dump_amount, out); serialize(item.test_previous_loss_values_dump_amount, out); serialize(item.previous_loss_values_to_keep_until_disk_sync, out); } friend void deserialize(dnn_trainer& item, std::istream& in) { item.wait_for_thread_to_pause(); int version = 0; deserialize(version, in); if (version != 13) throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer."); size_t num_layers = 0; deserialize(num_layers, in); if (num_layers != dnn_trainer::num_layers) { std::ostringstream sout; sout << "Error deserializing dlib::dnn_trainer. The saved sync file is for a network with " << std::endl; sout << "a different number of layers. We expected the number of layers to be " << dnn_trainer::num_layers << " but" << std::endl; sout << "instead the file contains " << num_layers << " layers." << std::endl; throw serialization_error(sout.str()); } double dtemp; long ltemp; deserialize(item.rs, in); deserialize(item.rs_test, in); deserialize(item.previous_loss_values, in); deserialize(item.max_num_epochs, in); deserialize(item.mini_batch_size, in); deserialize(item.verbose, in); deserialize(item.net, in); deserialize(item.devices[0]->solvers, in); deserialize(dtemp, in); item.learning_rate = dtemp; deserialize(item.min_learning_rate, in); deserialize(ltemp, in); item.iter_without_progress_thresh = ltemp; deserialize(ltemp, in); item.steps_without_progress = ltemp; deserialize(dtemp, in); item.learning_rate_shrink = dtemp; deserialize(item.epoch_iteration, in); deserialize(item.epoch_pos, in); deserialize(item.train_one_step_calls, in); deserialize(item.test_one_step_calls, in); deserialize(item.lr_schedule, in); deserialize(item.lr_schedule_pos, in); deserialize(ltemp, in); item.test_iter_without_progress_thresh = ltemp; deserialize(ltemp, in); item.test_steps_without_progress = ltemp; deserialize(item.test_previous_loss_values, in); deserialize(item.previous_loss_values_dump_amount, in); deserialize(item.test_previous_loss_values_dump_amount, in); deserialize(item.previous_loss_values_to_keep_until_disk_sync, in); if (item.devices.size() > 1) { const auto prev_dev = dlib::cuda::get_device(); // initialize all the other device networks and solver objects for (size_t i = 1; i < item.devices.size(); ++i) { // Switch to this device so that any tensor objects that get allocated when // we copy this stuff happen on this device. dlib::cuda::set_device(item.devices[i]->device_id); item.devices[i]->solvers = item.devices[0]->solvers; item.devices[i]->net = item.devices[0]->net; } dlib::cuda::set_device(prev_dev); } } // Empty out some of the previous loss values so that steps_without_progress will decrease below iter_without_progress_thresh. void drop_some_previous_loss_values() { for (unsigned long cnt = 0; cnt < previous_loss_values_dump_amount + iter_without_progress_thresh / 10 && previous_loss_values.size() > 0; ++cnt) previous_loss_values.pop_front(); } // Empty out some of the previous test loss values so that test_steps_without_progress will decrease below test_iter_without_progress_thresh. void drop_some_test_previous_loss_values() { for (unsigned long cnt = 0; cnt < test_previous_loss_values_dump_amount + test_iter_without_progress_thresh / 10 && test_previous_loss_values.size() > 0; ++cnt) test_previous_loss_values.pop_front(); } void sync_to_disk ( bool do_it_now = false ) { // don't sync anything if we haven't updated the network since the last sync if (!updated_net_since_last_sync) return; // If the sync file isn't set then don't do anything. if (sync_filename.size() == 0) return; // Only sync if it has been long enough since the last sync or we are being // explicitly forced to do it. if (std::chrono::system_clock::now() - last_sync_time > time_between_syncs || do_it_now) { wait_for_thread_to_pause(); // compact network before saving to disk. this->net.clean(); // if the loss has actually been going up since the last time we saved our // state to disk then something has probably gone wrong in the // optimization. So in this case we do the opposite and recall the // previously saved state in the hopes that the problem won't reoccur. if (loss_increased_since_last_disk_sync()) { std::ifstream fin(newest_syncfile(), std::ios::binary); deserialize(*this, fin); sync_file_reloaded = true; if (verbose) std::cout << "Loss has been increasing, reloading saved state from " << newest_syncfile() << std::endl; // Are we repeatedly hitting our head against the wall? If so, then we // might be better off giving up at this learning rate, and trying a // lower one instead. if (prob_loss_increasing_thresh >= prob_loss_increasing_thresh_max_value) { if (verbose) std::cout << "(and while at it, also shrinking the learning rate)" << std::endl; learning_rate = learning_rate_shrink * learning_rate; steps_without_progress = 0; test_steps_without_progress = 0; drop_some_previous_loss_values(); drop_some_test_previous_loss_values(); } } else { const std::string filename = oldest_syncfile(); serialize(filename) << *this; if (verbose) std::cout << "Saved state to " << filename << std::endl; } last_sync_time = std::chrono::system_clock::now(); main_iteration_counter_at_last_disk_sync = main_iteration_counter; updated_net_since_last_sync = false; } } std::string newest_syncfile ( ) { return select_newest_file(sync_filename, sync_filename + "_"); } std::string oldest_syncfile ( ) { return select_oldest_file(sync_filename, sync_filename + "_"); } bool loss_increased_since_last_disk_sync() { size_t gradient_updates_since_last_sync = main_iteration_counter - main_iteration_counter_at_last_disk_sync; // if we haven't synced anything to disk yet then return false. if (!std::ifstream(newest_syncfile(), std::ios::binary)) return false; // Now look at the data since a little before the last disk sync. We will // check if the loss is getting better or worse. while (previous_loss_values_to_keep_until_disk_sync.size() > 2 * gradient_updates_since_last_sync) previous_loss_values_to_keep_until_disk_sync.pop_front(); // Always retry if there are any nan or inf values for (auto x : previous_loss_values_to_keep_until_disk_sync) { if (std::isnan(x) || std::isinf(x)) return true; } // if we haven't seen much data yet then just say false. if (gradient_updates_since_last_sync < 30) return false; // if the loss is very likely to be increasing then return true const double prob1 = probability_values_are_increasing(previous_loss_values_to_keep_until_disk_sync); const double prob2 = probability_values_are_increasing_robust(previous_loss_values_to_keep_until_disk_sync); if (std::max(prob1, prob2) > prob_loss_increasing_thresh) { // Exponentially decay the threshold towards 1 so that if we keep finding // the loss to be increasing over and over we will make the test // progressively harder and harder until it fails, therefore ensuring we // can't get stuck reloading from a previous state over and over. prob_loss_increasing_thresh = std::min( 0.1*prob_loss_increasing_thresh + 0.9*1, prob_loss_increasing_thresh_max_value ); return true; } else { // decay back to the default threshold prob_loss_increasing_thresh = std::pow(prob_loss_increasing_thresh, 10.0); // but don't decay below the default value prob_loss_increasing_thresh = std::max(prob_loss_increasing_thresh, prob_loss_increasing_thresh_default_value); return false; } } struct clone_net{}; // per device state. All the containers have the same number of objects in them. struct device_data { device_data( int device_id_, net_type& net_, const solver_type& solver_ ) : device_id(device_id_), net(net_), solvers(num_computational_layers, solver_) {} device_data( int device_id_, net_type& net_, const solver_type& solver_, clone_net ) : device_id(device_id_), net_copy(std::make_shared(net_)), net(*net_copy), solvers(num_computational_layers, solver_) {} int device_id; std::shared_ptr net_copy; net_type& net; std::vector solvers; }; template < typename data_iterator, typename label_iterator > void send_job ( bool test_only, data_iterator dbegin, data_iterator dend, label_iterator lbegin ) { propagate_exception(); size_t num = std::distance(dbegin, dend); size_t devs = devices.size(); job.t.resize(devs); job.labels.resize(devs); job.have_data.resize(devs); job.test_only = test_only; // chop the data into devs blocks, each of about block_size elements. const double block_size = num / static_cast(devs); const auto prev_dev = dlib::cuda::get_device(); double j = 0; for (size_t i = 0; i < devs; ++i) { dlib::cuda::set_device(devices[i]->device_id); const size_t start = static_cast(std::round(j)); const size_t stop = static_cast(std::round(j + block_size)); if (start < stop) { devices[i]->net.to_tensor(dbegin+start, dbegin+stop, job.t[i]); job.labels[i].assign(lbegin+start, lbegin+stop); job.have_data[i] = true; } else { job.have_data[i] = false; } j += block_size; } DLIB_ASSERT(std::fabs(j - num) < 1e-10); dlib::cuda::set_device(prev_dev); job_pipe.enqueue(job); } template < typename data_iterator > void send_job ( bool test_only, data_iterator dbegin, data_iterator dend ) { typename std::vector::iterator nothing; send_job(test_only, dbegin, dend, nothing); } void print_progress() { if (lr_schedule.size() == 0) { if (test_previous_loss_values.size() == 0) std::cout << "steps without apparent progress: " << steps_without_progress; else std::cout << "steps without apparent progress: train=" << steps_without_progress << ", test=" << test_steps_without_progress; } else { std::ostringstream sout; sout << "percent complete: " << std::fixed << std::setprecision(2) << 100.0*lr_schedule_pos/(double)lr_schedule.size() << "%"; std::cout << sout.str(); } std::cout << std::endl; } void print_periodic_verbose_status() { if (verbose) { using namespace std::chrono; auto now_time = system_clock::now(); if (now_time-last_time > seconds(40)) { last_time = now_time; std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " " << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "; if (test_previous_loss_values.size() == 0) { std::cout << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; } else { std::cout << "train loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; std::cout << "test loss: " << rpad(cast_to_string(get_average_test_loss()),string_pad) << " "; } print_progress(); clear_average_loss(); } } } std::vector> devices; dlib::pipe job_pipe; std::shared_ptr thread_pools; job_t job; running_stats rs; running_stats_decayed rs_test; std::deque previous_loss_values; unsigned long max_num_epochs; size_t mini_batch_size; bool verbose; net_type& net; std::atomic learning_rate; double min_learning_rate; std::atomic iter_without_progress_thresh; std::atomic steps_without_progress; std::atomic test_iter_without_progress_thresh; std::atomic test_steps_without_progress; std::deque test_previous_loss_values; std::deque previous_loss_values_to_keep_until_disk_sync; std::atomic learning_rate_shrink; std::chrono::time_point last_sync_time; std::string sync_filename; std::chrono::seconds time_between_syncs; unsigned long epoch_iteration; size_t epoch_pos; std::chrono::time_point last_time; unsigned long long train_one_step_calls; unsigned long long test_one_step_calls; matrix lr_schedule; long lr_schedule_pos; unsigned long gradient_check_budget; std::exception_ptr eptr = nullptr; mutable std::mutex eptr_mutex; void propagate_exception() const { std::lock_guard lock(eptr_mutex); if (eptr) std::rethrow_exception(eptr); } // These 5 variables are not serialized size_t main_iteration_counter; size_t main_iteration_counter_at_last_disk_sync; double prob_loss_increasing_thresh_default_value; double prob_loss_increasing_thresh_max_value; double prob_loss_increasing_thresh; std::atomic updated_net_since_last_sync; bool sync_file_reloaded; unsigned long previous_loss_values_dump_amount; unsigned long test_previous_loss_values_dump_amount; }; // ---------------------------------------------------------------------------------------- template < typename net_type, typename solver_type > std::ostream& operator<< ( std::ostream& out, dnn_trainer& trainer ) { using std::endl; out << "dnn_trainer details: \n"; out << " net_type::num_layers: " << net_type::num_layers << endl; // figure out how big the net is in MB. std::ostringstream sout; net_type temp = trainer.get_net(); // make a copy so that we can clean it without mutating the trainer's net. temp.clean(); serialize(temp, sout); out << " net size: " << sout.str().size()/1024.0/1024.0 << " MiB" << endl; // Don't include the loss params in the hash since we print them on the next line. // They also aren't really part of the "architecture" of the network. out << " net architecture hash: " << md5(cast_to_string(trainer.get_net().subnet())) << endl; out << " loss: " << trainer.get_net().loss_details() << endl; out << " get_train_one_step_calls(): " << trainer.get_train_one_step_calls() << endl; out << " synchronization file: " << trainer.get_synchronization_file() << endl; out << " trainer.get_solvers()[0]: " << trainer.get_solvers()[0] << endl; out << " mini batch size: " << trainer.get_mini_batch_size() << endl; auto sched = trainer.get_learning_rate_schedule(); if (sched.size() != 0) { out << " using explicit user-supplied learning rate schedule" << endl; } else { out << " learning rate: "<< trainer.get_learning_rate() << endl; out << " learning rate shrink factor: "<< trainer.get_learning_rate_shrink_factor() << endl; out << " min learning rate: "<< trainer.get_min_learning_rate() << endl; out << " iterations without progress threshold: "<< trainer.get_iterations_without_progress_threshold() << endl; out << " test iterations without progress threshold: "<< trainer.get_test_iterations_without_progress_threshold() << endl; } return out; } // ---------------------------------------------------------------------------------------- } #endif // DLIB_DNn_TRAINER_H_