// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License. See LICENSE.txt for the full license.
#ifndef DLIB_DNn_TRAINER_H_
#define DLIB_DNn_TRAINER_H_
#include "trainer_abstract.h"
#include "core.h"
#include "solvers.h"
#include "../statistics.h"
#include <chrono>
#include <fstream>
#include <sstream>
#include "../serialize.h"
#include "../pipe.h"
#include "../threads.h"
#include "../cuda/cuda_dlib.h"
#include "../statistics/running_gradient.h"
#include <atomic>
#include <cstdio>
#include <set>
#include <future>
#include <exception>
#include <mutex>
#include "../dir_nav.h"
#include "../md5.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
namespace impl
{
template <typename training_label_type>
struct dnn_job_t
{
dnn_job_t() = default;
dnn_job_t(const dnn_job_t&) = delete;
dnn_job_t& operator=(const dnn_job_t&) = delete;
std::vector<std::vector<training_label_type>> labels;
std::vector<resizable_tensor> t;
std::vector<int> have_data; // have_data[i] is true if there is data in labels[i] and t[i].
bool test_only = false;
};
template <typename training_label_type>
void swap(dnn_job_t<training_label_type>& a, dnn_job_t<training_label_type>& b)
{
a.labels.swap(b.labels);
a.t.swap(b.t);
a.have_data.swap(b.have_data);
std::swap(a.test_only,b.test_only);
}
}
enum class force_flush_to_disk {
no = 0,
yes = 1
};
template <
typename net_type,
typename solver_type = sgd
>
class dnn_trainer : private threaded_object
{
public:
static_assert(is_loss_layer_type<net_type>::value,
"The last layer in a network must be a loss layer.");
typedef typename net_type::training_label_type training_label_type;
typedef typename net_type::input_type input_type;
const static size_t num_computational_layers = net_type::num_computational_layers;
const static size_t num_layers = net_type::num_layers;
using threads = std::vector<std::shared_ptr<thread_pool>>;
private:
typedef impl::dnn_job_t<training_label_type> job_t;
public:
dnn_trainer() = delete;
dnn_trainer(const dnn_trainer&) = delete;
dnn_trainer& operator=(const dnn_trainer&) = delete;
explicit dnn_trainer(net_type& net_) : job_pipe(0), net(net_)
{
solver_type default_solver;
devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, default_solver));
init();
}
dnn_trainer(
net_type& net_,
const solver_type& solver_
) : job_pipe(0), net(net_)
{
devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_));
init();
}
dnn_trainer(
net_type& net_,
const solver_type& solver_,
const std::vector<int>& cuda_extra_devices,
std::shared_ptr<threads> thread_pools_ = std::shared_ptr<threads>()
) : job_pipe(0), thread_pools(thread_pools_), net(net_)
{
devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_));
const int total_devices = dlib::cuda::get_num_devices();
// Make device contexts for the extra device ids but be careful to avoid any
// duplicate ids.
std::set<int> temp(cuda_extra_devices.begin(), cuda_extra_devices.end());
temp.erase(devices[0]->device_id);
for (auto id : temp)
{
DLIB_CASSERT(0 <= id && id < total_devices, "Invalid CUDA device id given to dnn_trainer.");
// Switch to this device so that any tensor objects that get allocated when
// we create the device context happen on this device.
dlib::cuda::set_device(id);
devices.push_back(std::make_shared<device_data>(id, net, solver_, clone_net()));
}
// Set the current device back to what it was before this constructor was
// called.
dlib::cuda::set_device(devices[0]->device_id);
init();
}
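// Usage sketch (illustrative, not part of the library itself): assuming a small network
// type such as
//     using net_type = loss_multiclass_log<fc<10,relu<fc<84,input<matrix<unsigned char>>>>>>;
// a trainer can be constructed in any of the following ways:
//     net_type net;
//     dnn_trainer<net_type> trainer1(net);                                 // default sgd solver
//     dnn_trainer<net_type,adam> trainer2(net, adam(0.0005, 0.9, 0.999));  // explicit solver
//     dnn_trainer<net_type> trainer3(net, sgd(), {0,1,2,3});               // also train on extra CUDA devices
// The extra device ids are de-duplicated against the currently selected CUDA device, and
// an optional shared thread-pool vector can be passed as the 4th argument when several
// trainers should reuse the same per-device threads.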
~dnn_trainer(
)
{
job_pipe.disable();
stop();
wait();
}
net_type& get_net (
force_flush_to_disk force_flush = force_flush_to_disk::yes
)
{
wait_for_thread_to_pause();
sync_to_disk(force_flush == force_flush_to_disk::yes);
propagate_exception();
return net;
}
unsigned long get_mini_batch_size (
) const { return mini_batch_size; }
void set_mini_batch_size (
unsigned long batch_size
)
{
DLIB_CASSERT(batch_size > 0);
mini_batch_size = batch_size;
}
unsigned long get_max_num_epochs (
) const { return max_num_epochs; }
void set_max_num_epochs (
unsigned long num
)
{
DLIB_CASSERT(num > 0);
max_num_epochs = num;
}
void be_verbose (
)
{
verbose = true;
}
void be_quiet (
)
{
verbose = false;
}
const std::vector<solver_type>& get_solvers (
) const
{
wait_for_thread_to_pause();
propagate_exception();
return devices[0]->solvers;
}
void train_one_step (
const std::vector<input_type>& data,
const std::vector<training_label_type>& labels
)
{
DLIB_CASSERT(data.size() == labels.size());
train_one_step(data.begin(), data.end(), labels.begin());
}
template <
typename data_iterator,
typename label_iterator
>
void train_one_step (
data_iterator dbegin,
data_iterator dend,
label_iterator lbegin
)
{
DLIB_CASSERT(std::distance(dbegin, dend) > 0);
print_periodic_verbose_status();
sync_to_disk();
send_job(false, dbegin, dend, lbegin);
++train_one_step_calls;
}
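// Usage sketch (illustrative): a hand-written training loop built on train_one_step(),
// assuming mini_batch_samples and mini_batch_labels are refilled with a new mini-batch
// on every pass:
//     while (trainer.get_learning_rate() >= trainer.get_min_learning_rate())
//     {
//         // ... fill mini_batch_samples and mini_batch_labels ...
//         trainer.train_one_step(mini_batch_samples, mini_batch_labels);
//     }
//     // Training runs asynchronously, so block until all queued mini-batches have been
//     // processed and the final parameters are synced back into the network.
//     trainer.get_net();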
void train_one_step (
const std::vector<input_type>& data
)
{
train_one_step(data.begin(), data.end());
}
template <
typename data_iterator
>
void train_one_step (
data_iterator dbegin,
data_iterator dend
)
{
DLIB_CASSERT(std::distance(dbegin, dend) > 0);
print_periodic_verbose_status();
sync_to_disk();
send_job(false, dbegin, dend);
++train_one_step_calls;
}
void test_one_step (
const std::vector<input_type>& data,
const std::vector<training_label_type>& labels
)
{
DLIB_CASSERT(data.size() == labels.size());
test_one_step(data.begin(), data.end(), labels.begin());
}
template <
typename data_iterator,
typename label_iterator
>
void test_one_step (
data_iterator dbegin,
data_iterator dend,
label_iterator lbegin
)
{
DLIB_CASSERT(std::distance(dbegin, dend) > 0);
print_periodic_verbose_status();
sync_to_disk();
send_job(true, dbegin, dend, lbegin);
++test_one_step_calls;
}
void test_one_step (
const std::vector<input_type>& data
)
{
test_one_step(data.begin(), data.end());
}
template <
typename data_iterator
>
void test_one_step (
data_iterator dbegin,
data_iterator dend
)
{
DLIB_CASSERT(std::distance(dbegin, dend) > 0);
print_periodic_verbose_status();
sync_to_disk();
send_job(true, dbegin, dend);
++test_one_step_calls;
}
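// Usage sketch (illustrative): interleaving held-out evaluation with training so that the
// test loss also drives the learning rate shrinking, assuming test_samples/test_labels
// hold a validation set and mini_batch_samples/mini_batch_labels the training batches:
//     trainer.set_test_iterations_without_progress_threshold(500);
//     while (trainer.get_learning_rate() >= trainer.get_min_learning_rate())
//     {
//         trainer.train_one_step(mini_batch_samples, mini_batch_labels);
//         if (trainer.get_train_one_step_calls() % 10 == 0)
//             trainer.test_one_step(test_samples, test_labels);
//     }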
void train (
const std::vector<input_type>& data,
const std::vector<training_label_type>& labels
)
{
DLIB_CASSERT(data.size() == labels.size() && data.size() > 0);
// The reason these two loops don't initialize their counter variables but
// instead use class members is so we can include the state of the loops in the
// stuff written by sync_to_disk()
for (;
epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate;
++epoch_iteration)
{
using namespace std::chrono;
last_time = system_clock::now();
clear_average_loss();
for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size)
{
if (verbose)
{
auto now_time = system_clock::now();
if (now_time-last_time > seconds(20))
{
last_time = now_time;
auto iter = epoch_iteration + epoch_pos/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
print_progress();
}
}
sync_to_disk();
send_job(false, data.begin()+epoch_pos,
data.begin()+std::min(epoch_pos+mini_batch_size,data.size()),
labels.begin()+epoch_pos);
}
epoch_pos = 0;
if (verbose)
{
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
print_progress();
}
}
wait_for_thread_to_pause();
// if we modified the network at all then be sure to sync the final result.
sync_to_disk(true);
}
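// Note (illustrative sketch): train() blocks until either max_num_epochs epochs have run
// or the learning rate has decayed below get_min_learning_rate(), e.g.
//     trainer.set_max_num_epochs(300);
//     trainer.set_min_learning_rate(1e-5);
//     trainer.be_verbose();
//     trainer.train(training_images, training_labels);
// where training_images and training_labels are assumed to be std::vectors of
// input_type and training_label_type respectively.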
void train (
const std::vector<input_type>& data
)
{
DLIB_CASSERT(data.size() > 0);
const bool has_unsupervised_loss = std::is_same<no_label_type, training_label_type>::value;
static_assert(has_unsupervised_loss,
"You can only call this version of train() when using an unsupervised loss.");
// The reason these two loops don't initialize their counter variables but
// instead use class members is so we can include the state of the loops in the
// stuff written by sync_to_disk()
for (;
epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate;
++epoch_iteration)
{
using namespace std::chrono;
last_time = system_clock::now();
clear_average_loss();
for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size)
{
if (verbose)
{
auto now_time = system_clock::now();
if (now_time-last_time > seconds(20))
{
last_time = now_time;
auto iter = epoch_iteration + epoch_pos/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
print_progress();
}
}
sync_to_disk();
send_job(false, data.begin()+epoch_pos,
data.begin()+std::min(epoch_pos+mini_batch_size,data.size()));
}
epoch_pos = 0;
if (verbose)
{
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
print_progress();
}
}
wait_for_thread_to_pause();
// if we modified the network at all then be sure to sync the final result.
sync_to_disk(true);
}
void set_synchronization_file (
const std::string& filename,
std::chrono::seconds time_between_syncs_ = std::chrono::minutes(15)
)
{
last_sync_time = std::chrono::system_clock::now();
sync_filename = filename;
time_between_syncs = time_between_syncs_;
// Check if the sync file already exists; if it does we should load it.
std::ifstream fin(newest_syncfile(), std::ios::binary);
if (fin)
deserialize(*this, fin);
}
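// Usage sketch (illustrative): enabling periodic checkpointing, assuming the file name
// "trainer_state.dat" is writable:
//     trainer.set_synchronization_file("trainer_state.dat", std::chrono::minutes(5));
// If the file (or its "_" backup) already exists the trainer state is reloaded from it,
// so simply re-running the same program resumes an interrupted training session.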
const std::string& get_synchronization_file (
)
{
return sync_filename;
}
double get_average_loss (
) const
{
wait_for_thread_to_pause();
return rs.mean();
}
double get_average_test_loss (
) const
{
wait_for_thread_to_pause();
return rs_test.mean();
}
void clear_average_loss (
)
{
wait_for_thread_to_pause();
rs.clear();
}
void set_learning_rate (
double lr
)
{
DLIB_CASSERT(lr > 0);
wait_for_thread_to_pause();
if (learning_rate != lr)
{
steps_without_progress = 0;
test_steps_without_progress = 0;
previous_loss_values.clear();
test_previous_loss_values.clear();
previous_loss_values_to_keep_until_disk_sync.clear();
}
learning_rate = lr;
lr_schedule.set_size(0);
}
double get_learning_rate(
) const
{
return learning_rate;
}
void set_min_learning_rate (
double lr
)
{
DLIB_CASSERT(lr > 0);
wait_for_thread_to_pause();
lr_schedule.set_size(0);
min_learning_rate = lr;
}
double get_min_learning_rate (
) const
{
return min_learning_rate;
}
template <typename EXP>
void set_learning_rate_schedule (
const matrix_exp<EXP>& schedule
)
{
DLIB_CASSERT(schedule.size() > 0);
DLIB_CASSERT(min(schedule) > 0);
set_learning_rate(schedule(0,0));
set_min_learning_rate(min(schedule));
set_learning_rate_shrink_factor(1);
lr_schedule = matrix_cast<double>(reshape_to_column_vector(schedule));
lr_schedule_pos = 0;
}
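// Usage sketch (illustrative): a fixed exponential decay from 0.1 down to 1e-4 spread
// over 100000 mini-batches, built with dlib's logspace():
//     trainer.set_learning_rate_schedule(logspace(std::log10(0.1), std::log10(1e-4), 100000));
// Once a schedule is set, the progress-based shrinking logic is disabled and the learning
// rate simply follows the schedule, one entry per mini-batch.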
const matrix<double,0,1>& get_learning_rate_schedule (
) const
{
return lr_schedule;
}
void set_iterations_without_progress_threshold (
unsigned long thresh
)
{
wait_for_thread_to_pause();
lr_schedule.set_size(0);
iter_without_progress_thresh = thresh;
}
unsigned long get_iterations_without_progress_threshold (
) const
{
return iter_without_progress_thresh;
}
unsigned long get_steps_without_progress (
) const
{
return steps_without_progress;
}
void set_test_iterations_without_progress_threshold (
unsigned long thresh
)
{
wait_for_thread_to_pause();
lr_schedule.set_size(0);
test_iter_without_progress_thresh = thresh;
}
unsigned long get_test_iterations_without_progress_threshold (
) const
{
return test_iter_without_progress_thresh;
}
unsigned long get_test_steps_without_progress (
) const
{
return test_steps_without_progress;
}
void set_learning_rate_shrink_factor (
double shrink
)
{
DLIB_CASSERT(0 < shrink && shrink <= 1);
wait_for_thread_to_pause();
lr_schedule.set_size(0);
learning_rate_shrink = shrink;
steps_without_progress = 0;
test_steps_without_progress = 0;
}
double get_learning_rate_shrink_factor (
) const
{
return learning_rate_shrink;
}
unsigned long long get_train_one_step_calls (
) const
{
return train_one_step_calls;
}
unsigned long long get_test_one_step_calls (
) const
{
return test_one_step_calls;
}
private:
void record_test_loss(double loss)
{
test_previous_loss_values.push_back(loss);
if (is_finite(loss))
rs_test.add(loss);
// discard really old loss values.
while (test_previous_loss_values.size() > test_iter_without_progress_thresh)
test_previous_loss_values.pop_front();
}
void record_loss(double loss)
{
// This kind of budgeting causes our gradient checking to use a fixed amount of
// computational resources, regardless of the size of iter_without_progress_thresh.
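// With the default iter_without_progress_thresh of 2000 this means the O(n)
// count_steps_without_decrease() scan in thread() runs only about once every
// 2000/200 = 10 mini-batches, so its amortized cost per step stays constant.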
gradient_check_budget += 200;
rs.add(loss);
previous_loss_values.push_back(loss);
// discard really old loss values.
while (previous_loss_values.size() > iter_without_progress_thresh)
previous_loss_values.pop_front();
// separately keep another loss history until disk sync
// (but only if disk sync is enabled)
if (!sync_filename.empty())
previous_loss_values_to_keep_until_disk_sync.push_back(loss);
}
template <typename T>
double compute_parameter_gradients(size_t device, job_t& next_job, const T&)
{
if (next_job.have_data[device])
{
auto&& dev = *devices[device];
dlib::cuda::set_device(dev.device_id);
if (next_job.test_only)
return dev.net.compute_loss(next_job.t[device], next_job.labels[device].begin());
else
return dev.net.compute_parameter_gradients(next_job.t[device], next_job.labels[device].begin());
}
else
{
return 0;
}
}
double compute_parameter_gradients(size_t device, job_t& next_job, const no_label_type&)
{
if (next_job.have_data[device])
{
auto&& dev = *devices[device];
dlib::cuda::set_device(dev.device_id);
no_label_type pick_which_run_update;
if (next_job.test_only)
return dev.net.compute_loss(next_job.t[device]);
else
return dev.net.compute_parameter_gradients(next_job.t[device]);
}
else
{
return 0;
}
}
void update_parameters(size_t device)
{
auto&& dev = *devices[device];
dlib::cuda::set_device(dev.device_id);
dev.net.update_parameters(make_sstack(dev.solvers), learning_rate);
}
void thread() try
{
training_label_type pick_which_run_update;
job_t next_job;
std::vector<dlib::future<double>> losses(devices.size());
std::vector<tt::multi_device_tensor_averager> averagers;
// An array of all the parameter tensors in the first network. We will
// periodically copy these tensors to all the other devices to make sure the
// different GPUs don't go out of sync.
std::vector<tensor*> reference_params;
visit_layer_parameters(devices[0]->net, [&](tensor& t) { reference_params.push_back(&t); });
// If no external thread pools vector was passed, then create one that will
// be automatically destructed as soon as the dnn_trainer object goes out of
// scope.
if (!thread_pools)
thread_pools = std::make_shared<threads>();
auto& tp = *thread_pools;
// We make separate thread pools with just one thread in them because we want
// to make sure each device is always executed on the same thread. We care
// about this because there are thread_local context variables for some cuda
// components and they get allocated for each combination of thread and device.
// So if we make sure the same device always uses the same thread this will
// reduce the number of contexts we allocate from num_devices*num_devices to
// just num_devices.
while (tp.size() < devices.size())
tp.push_back(std::make_shared<thread_pool>(1));
main_iteration_counter = 0;
while(job_pipe.dequeue(next_job))
{
if (next_job.test_only)
{
// compute the testing loss
for (size_t i = 0; i < devices.size(); ++i)
tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]);
// aggregate loss values from all the network computations.
double theloss = 0;
for (auto&& loss : losses)
theloss += loss.get();
record_test_loss(theloss/losses.size());
// Check if we should shrink the learning rate based on how the test
// error has been doing lately.
if (learning_rate_shrink != 1)
{
test_steps_without_progress = count_steps_without_decrease(test_previous_loss_values);
if (test_steps_without_progress >= test_iter_without_progress_thresh)
{
test_steps_without_progress = count_steps_without_decrease_robust(test_previous_loss_values);
if (test_steps_without_progress >= test_iter_without_progress_thresh)
{
// optimization has flattened out, so drop the learning rate.
learning_rate = learning_rate_shrink*learning_rate;
test_steps_without_progress = 0;
// Empty out some of the previous loss values so that test_steps_without_progress
// will decrease below test_iter_without_progress_thresh.
drop_some_test_previous_loss_values();
}
}
}
continue;
}
updated_net_since_last_sync = true;
++main_iteration_counter;
// Call compute_parameter_gradients() and update_parameters() but pick the
// right version for unsupervised or supervised training based on the type
// of training_label_type.
for (size_t i = 0; i < devices.size(); ++i)
tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]);
// aggregate loss values from all the network computations.
double theloss = 0;
for (auto&& loss : losses)
theloss += loss.get();
record_loss(theloss/losses.size());
// Now, if there is more than one active device we need to synchronize the
// gradient updates between devices. So we do that now.
if (devices.size() > 1)
{
// if this is the first iteration then we need to setup the averagers.
// We can't do this outside the loop because the tensors that get
// averaged need to be allocated to their devices before we call set()
// so that the averagers can determine how best to average them.
if (averagers.size() == 0 || sync_file_reloaded)
{
averagers = std::vector<tt::multi_device_tensor_averager>(net_type::num_computational_layers);
// setup the averagers to point to the tensors in the networks.
std::vector<std::vector<tensor*>> all_tensors(devices.size());
for (size_t i = 0; i < all_tensors.size(); ++i)
{
all_tensors[i].resize(net_type::num_computational_layers);
visit_layer_parameter_gradients(devices[i]->net, [&](size_t j, tensor& t){
all_tensors[i][j] = &t;
});
}
// Now set each averager to average the tensors at the same layer in each
// network.
for (size_t i = 0; i < net_type::num_computational_layers; ++i)
{
std::vector<tensor*> temp(all_tensors.size());
for (size_t j = 0; j < all_tensors.size(); ++j)
{
temp[j] = all_tensors[j][i];
DLIB_CASSERT(temp[0]->size() == temp[j]->size(),
"Make sure you don't modify the network structure "
"or number of parameters after constructing the trainer.");
}
// ignore layers that don't have parameters
if (temp[0]->size() != 0)
averagers[i].set(temp);
}
sync_file_reloaded = false;
}
for (auto&& d : devices)
cuda::device_synchronize(d->device_id);
for (auto&& avg : averagers)
avg.average();
}
// Now apply all the updates to each device.
for (size_t i = 0; i < devices.size(); ++i)
tp[i]->add_task_by_value([&,i](){ if (next_job.have_data[i]) update_parameters(i); });
// and wait for the updates to all happen.
for (size_t i = 0; i < devices.size(); ++i)
tp[i]->wait_for_all_tasks();
// Every now and then force all the parameters to be the same just to make
// sure they aren't drifting apart due to any non-deterministic behavior on
// the GPU. It's also important to do this on the first iteration because
// the different networks may be initialized differently when tensor data
// is first passed through them. So this code block deals with these
// issues.
if (devices.size() > 1 && main_iteration_counter%2000 == 1)
{
for (size_t i = 1; i < devices.size(); ++i)
{
visit_layer_parameters(devices[i]->net, [&](size_t j, tensor& t)
{
memcpy(t, *reference_params[j]);
});
}
}
// If we have been running for a while then check if the loss is still
// dropping. If it isn't then we will reduce the learning rate. Note that we
// have a "budget" that prevents us from calling
// count_steps_without_decrease() every iteration. We do this because
// it can be expensive to compute when previous_loss_values is large.
if (gradient_check_budget > iter_without_progress_thresh && learning_rate_shrink != 1)
{
gradient_check_budget = 0;
steps_without_progress = count_steps_without_decrease(previous_loss_values);
if (steps_without_progress >= iter_without_progress_thresh)
{
// Double check that we really aren't seeing a decrease. This second check
// discards the top 10% largest values and checks again. We do
// this because sometimes a mini-batch might be bad and cause the
// loss to suddenly jump up, making count_steps_without_decrease()
// return a large number. But if we discard the top 10% of the
// values in previous_loss_values then we are robust to that kind
// of noise. Another way of looking at it: if the reason
// count_steps_without_decrease() returns a large value is only
// because the most recent loss values have suddenly been large,
// then we shouldn't stop or lower the learning rate. We should
// keep going until whatever disturbance we hit is damped down.
steps_without_progress = count_steps_without_decrease_robust(previous_loss_values);
if (steps_without_progress >= iter_without_progress_thresh)
{
// optimization has flattened out, so drop the learning rate.
learning_rate = learning_rate_shrink*learning_rate;
steps_without_progress = 0;
// Empty out some of the previous loss values so that steps_without_progress
// will decrease below iter_without_progress_thresh.
drop_some_previous_loss_values();
}
}
}
else if (lr_schedule.size() != 0) // or use the learning rate schedule if we have one.
{
if (lr_schedule_pos < lr_schedule.size())
learning_rate = lr_schedule(lr_schedule_pos++);
else
learning_rate = lr_schedule(lr_schedule.size()-1)*0.99;
}
}
}
catch(...)
{
// If an exception happens then permanently disable the trainer object.
job_pipe.disable();
std::lock_guard<std::mutex> lock(eptr_mutex);
eptr = std::current_exception();
}
void wait_for_thread_to_pause() const
{
job_pipe.wait_for_num_blocked_dequeues(1);
}
const static long string_pad = 11;
const static long epoch_string_pad = 4;
const static long lr_string_pad = 4;
void init()
{
max_num_epochs = 10000;
mini_batch_size = 128;
verbose = false;
learning_rate = 1e-2;
min_learning_rate = 1e-5;
iter_without_progress_thresh = 2000;
steps_without_progress = 0;
test_iter_without_progress_thresh = 500;
test_steps_without_progress = 0;
learning_rate_shrink = 0.1;
epoch_iteration = 0;
epoch_pos = 0;
train_one_step_calls = 0;
test_one_step_calls = 0;
gradient_check_budget = 0;
lr_schedule_pos = 0;
main_iteration_counter = 0;
main_iteration_counter_at_last_disk_sync = 0;
prob_loss_increasing_thresh_default_value = 0.99;
prob_loss_increasing_thresh_max_value = 0.99999;
prob_loss_increasing_thresh = prob_loss_increasing_thresh_default_value;
updated_net_since_last_sync = false;
sync_file_reloaded = false;
previous_loss_values_dump_amount = 400;
test_previous_loss_values_dump_amount = 100;
rs_test = running_stats_decayed<double>(200);
start();
}
// serialize and deserialize are private because we hold net by reference so
// allowing someone to serialize this training object is weird and will likely
// result in user errors. However, we use these functions as part of the automatic
// sync code in this object.
friend void serialize(const dnn_trainer& item, std::ostream& out)
{
item.wait_for_thread_to_pause();
int version = 13;
serialize(version, out);
size_t nl = dnn_trainer::num_layers;
serialize(nl, out);
serialize(item.rs, out);
serialize(item.rs_test, out);
serialize(item.previous_loss_values, out);
serialize(item.max_num_epochs, out);
serialize(item.mini_batch_size, out);
serialize(item.verbose, out);
serialize(item.net, out);
serialize(item.devices[0]->solvers, out);
serialize(item.learning_rate.load(), out);
serialize(item.min_learning_rate, out);
serialize(item.iter_without_progress_thresh.load(), out);
serialize(item.steps_without_progress.load(), out);
serialize(item.learning_rate_shrink.load(), out);
serialize(item.epoch_iteration, out);
serialize(item.epoch_pos, out);
serialize(item.train_one_step_calls, out);
serialize(item.test_one_step_calls, out);
serialize(item.lr_schedule, out);
serialize(item.lr_schedule_pos, out);
serialize(item.test_iter_without_progress_thresh.load(), out);
serialize(item.test_steps_without_progress.load(), out);
serialize(item.test_previous_loss_values, out);
serialize(item.previous_loss_values_dump_amount, out);
serialize(item.test_previous_loss_values_dump_amount, out);
serialize(item.previous_loss_values_to_keep_until_disk_sync, out);
}
friend void deserialize(dnn_trainer& item, std::istream& in)
{
item.wait_for_thread_to_pause();
int version = 0;
deserialize(version, in);
if (version != 13)
throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
size_t num_layers = 0;
deserialize(num_layers, in);
if (num_layers != dnn_trainer::num_layers)
{
std::ostringstream sout;
sout << "Error deserializing dlib::dnn_trainer. The saved sync file is for a network with " << std::endl;
sout << "a different number of layers. We expected the number of layers to be " << dnn_trainer::num_layers << " but" << std::endl;
sout << "instead the file contains " << num_layers << " layers." << std::endl;
throw serialization_error(sout.str());
}
double dtemp; long ltemp;
deserialize(item.rs, in);
deserialize(item.rs_test, in);
deserialize(item.previous_loss_values, in);
deserialize(item.max_num_epochs, in);
deserialize(item.mini_batch_size, in);
deserialize(item.verbose, in);
deserialize(item.net, in);
deserialize(item.devices[0]->solvers, in);
deserialize(dtemp, in); item.learning_rate = dtemp;
deserialize(item.min_learning_rate, in);
deserialize(ltemp, in); item.iter_without_progress_thresh = ltemp;
deserialize(ltemp, in); item.steps_without_progress = ltemp;
deserialize(dtemp, in); item.learning_rate_shrink = dtemp;
deserialize(item.epoch_iteration, in);
deserialize(item.epoch_pos, in);
deserialize(item.train_one_step_calls, in);
deserialize(item.test_one_step_calls, in);
deserialize(item.lr_schedule, in);
deserialize(item.lr_schedule_pos, in);
deserialize(ltemp, in); item.test_iter_without_progress_thresh = ltemp;
deserialize(ltemp, in); item.test_steps_without_progress = ltemp;
deserialize(item.test_previous_loss_values, in);
deserialize(item.previous_loss_values_dump_amount, in);
deserialize(item.test_previous_loss_values_dump_amount, in);
deserialize(item.previous_loss_values_to_keep_until_disk_sync, in);
if (item.devices.size() > 1)
{
const auto prev_dev = dlib::cuda::get_device();
// initialize all the other device networks and solver objects
for (size_t i = 1; i < item.devices.size(); ++i)
{
// Switch to this device so that any tensor objects that get allocated when
// we copy this stuff happen on this device.
dlib::cuda::set_device(item.devices[i]->device_id);
item.devices[i]->solvers = item.devices[0]->solvers;
item.devices[i]->net = item.devices[0]->net;
}
dlib::cuda::set_device(prev_dev);
}
}
// Empty out some of the previous loss values so that steps_without_progress will decrease below iter_without_progress_thresh.
void drop_some_previous_loss_values()
{
for (unsigned long cnt = 0; cnt < previous_loss_values_dump_amount + iter_without_progress_thresh / 10 && previous_loss_values.size() > 0; ++cnt)
previous_loss_values.pop_front();
}
// Empty out some of the previous test loss values so that test_steps_without_progress will decrease below test_iter_without_progress_thresh.
void drop_some_test_previous_loss_values()
{
for (unsigned long cnt = 0; cnt < test_previous_loss_values_dump_amount + test_iter_without_progress_thresh / 10 && test_previous_loss_values.size() > 0; ++cnt)
test_previous_loss_values.pop_front();
}
void sync_to_disk (
bool do_it_now = false
)
{
// don't sync anything if we haven't updated the network since the last sync
if (!updated_net_since_last_sync)
return;
// If the sync file isn't set then don't do anything.
if (sync_filename.size() == 0)
return;
// Only sync if it has been long enough since the last sync or we are being
// explicitly forced to do it.
if (std::chrono::system_clock::now() - last_sync_time > time_between_syncs ||
do_it_now)
{
wait_for_thread_to_pause();
// compact network before saving to disk.
this->net.clean();
// if the loss has actually been going up since the last time we saved our
// state to disk then something has probably gone wrong in the
// optimization. So in this case we do the opposite and recall the
// previously saved state in the hopes that the problem won't reoccur.
if (loss_increased_since_last_disk_sync())
{
std::ifstream fin(newest_syncfile(), std::ios::binary);
deserialize(*this, fin);
sync_file_reloaded = true;
if (verbose)
std::cout << "Loss has been increasing, reloading saved state from " << newest_syncfile() << std::endl;
// Are we repeatedly hitting our head against the wall? If so, then we
// might be better off giving up at this learning rate, and trying a
// lower one instead.
if (prob_loss_increasing_thresh >= prob_loss_increasing_thresh_max_value)
{
if (verbose)
std::cout << "(and while at it, also shrinking the learning rate)" << std::endl;
learning_rate = learning_rate_shrink * learning_rate;
steps_without_progress = 0;
test_steps_without_progress = 0;
drop_some_previous_loss_values();
drop_some_test_previous_loss_values();
}
}
else
{
const std::string filename = oldest_syncfile();
serialize(filename) << *this;
if (verbose)
std::cout << "Saved state to " << filename << std::endl;
}
last_sync_time = std::chrono::system_clock::now();
main_iteration_counter_at_last_disk_sync = main_iteration_counter;
updated_net_since_last_sync = false;
}
}
std::string newest_syncfile (
)
{
return select_newest_file(sync_filename, sync_filename + "_");
}
std::string oldest_syncfile (
)
{
return select_oldest_file(sync_filename, sync_filename + "_");
}
bool loss_increased_since_last_disk_sync()
{
size_t gradient_updates_since_last_sync = main_iteration_counter - main_iteration_counter_at_last_disk_sync;
// if we haven't synced anything to disk yet then return false.
if (!std::ifstream(newest_syncfile(), std::ios::binary))
return false;
// Now look at the data since a little before the last disk sync. We will
// check if the loss is getting better or worse.
while (previous_loss_values_to_keep_until_disk_sync.size() > 2 * gradient_updates_since_last_sync)
previous_loss_values_to_keep_until_disk_sync.pop_front();
// Always retry if there are any nan or inf values
for (auto x : previous_loss_values_to_keep_until_disk_sync)
{
if (std::isnan(x) || std::isinf(x))
return true;
}
// if we haven't seen much data yet then just say false.
if (gradient_updates_since_last_sync < 30)
return false;
// if the loss is very likely to be increasing then return true
const double prob1 = probability_values_are_increasing(previous_loss_values_to_keep_until_disk_sync);
const double prob2 = probability_values_are_increasing_robust(previous_loss_values_to_keep_until_disk_sync);
if (std::max(prob1, prob2) > prob_loss_increasing_thresh)
{
// Exponentially decay the threshold towards 1 so that if we keep finding
// the loss to be increasing over and over we will make the test
// progressively harder and harder until it fails, therefore ensuring we
// can't get stuck reloading from a previous state over and over.
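// For instance, starting from the default threshold of 0.99 the update below yields
// 0.999, then 0.9999, and is then clamped at prob_loss_increasing_thresh_max_value
// (0.99999).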
prob_loss_increasing_thresh = std::min(
0.1*prob_loss_increasing_thresh + 0.9*1,
prob_loss_increasing_thresh_max_value
);
return true;
}
else
{
// decay back to the default threshold
prob_loss_increasing_thresh = std::pow(prob_loss_increasing_thresh, 10.0);
// but don't decay below the default value
prob_loss_increasing_thresh = std::max(prob_loss_increasing_thresh, prob_loss_increasing_thresh_default_value);
return false;
}
}
struct clone_net{};
// per device state. All the containers have the same number of objects in them.
struct device_data
{
device_data(
int device_id_,
net_type& net_,
const solver_type& solver_
) : device_id(device_id_), net(net_), solvers(num_computational_layers, solver_) {}
device_data(
int device_id_,
net_type& net_,
const solver_type& solver_,
clone_net
) : device_id(device_id_), net_copy(std::make_shared<net_type>(net_)), net(*net_copy), solvers(num_computational_layers, solver_) {}
int device_id;
std::shared_ptr<net_type> net_copy;
net_type& net;
std::vector<solver_type> solvers;
};
template <
typename data_iterator,
typename label_iterator
>
void send_job (
bool test_only,
data_iterator dbegin,
data_iterator dend,
label_iterator lbegin
)
{
propagate_exception();
size_t num = std::distance(dbegin, dend);
size_t devs = devices.size();
job.t.resize(devs);
job.labels.resize(devs);
job.have_data.resize(devs);
job.test_only = test_only;
// chop the data into devs blocks, each of about block_size elements.
const double block_size = num / static_cast<double>(devs);
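// For example, with num = 5 samples and devs = 2 devices, block_size is 2.5 and the
// rounding below produces the half-open ranges [0,3) and [3,5), so every sample is
// assigned to exactly one device.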
const auto prev_dev = dlib::cuda::get_device();
double j = 0;
for (size_t i = 0; i < devs; ++i)
{
dlib::cuda::set_device(devices[i]->device_id);
const size_t start = static_cast<size_t>(std::round(j));
const size_t stop = static_cast<size_t>(std::round(j + block_size));
if (start < stop)
{
devices[i]->net.to_tensor(dbegin+start, dbegin+stop, job.t[i]);
job.labels[i].assign(lbegin+start, lbegin+stop);
job.have_data[i] = true;
}
else
{
job.have_data[i] = false;
}
j += block_size;
}
DLIB_ASSERT(std::fabs(j - num) < 1e-10);
dlib::cuda::set_device(prev_dev);
job_pipe.enqueue(job);
}
template <
typename data_iterator
>
void send_job (
bool test_only,
data_iterator dbegin,
data_iterator dend
)
{
typename std::vector<training_label_type>::iterator nothing;
send_job(test_only, dbegin, dend, nothing);
}
void print_progress()
{
if (lr_schedule.size() == 0)
{
if (test_previous_loss_values.size() == 0)
std::cout << "steps without apparent progress: " << steps_without_progress;
else
std::cout << "steps without apparent progress: train=" << steps_without_progress << ", test=" << test_steps_without_progress;
}
else
{
std::ostringstream sout;
sout << "percent complete: " << std::fixed << std::setprecision(2) << 100.0*lr_schedule_pos/(double)lr_schedule.size() << "%";
std::cout << sout.str();
}
std::cout << std::endl;
}
void print_periodic_verbose_status()
{
if (verbose)
{
using namespace std::chrono;
auto now_time = system_clock::now();
if (now_time-last_time > seconds(40))
{
last_time = now_time;
std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " ";
if (test_previous_loss_values.size() == 0)
{
std::cout << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
}
else
{
std::cout << "train loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
std::cout << "test loss: " << rpad(cast_to_string(get_average_test_loss()),string_pad) << " ";
}
print_progress();
clear_average_loss();
}
}
}
std::vector<std::shared_ptr<device_data>> devices;
dlib::pipe<job_t> job_pipe;
std::shared_ptr<threads> thread_pools;
job_t job;
running_stats<double> rs;
running_stats_decayed<double> rs_test;
std::deque<double> previous_loss_values;
unsigned long max_num_epochs;
size_t mini_batch_size;
bool verbose;
net_type& net;
std::atomic<double> learning_rate;
double min_learning_rate;
std::atomic<unsigned long> iter_without_progress_thresh;
std::atomic<unsigned long> steps_without_progress;
std::atomic<unsigned long> test_iter_without_progress_thresh;
std::atomic<unsigned long> test_steps_without_progress;
std::deque<double> test_previous_loss_values;
std::deque<double> previous_loss_values_to_keep_until_disk_sync;
std::atomic<double> learning_rate_shrink;
std::chrono::time_point<std::chrono::system_clock> last_sync_time;
std::string sync_filename;
std::chrono::seconds time_between_syncs;
unsigned long epoch_iteration;
size_t epoch_pos;
std::chrono::time_point<std::chrono::system_clock> last_time;
unsigned long long train_one_step_calls;
unsigned long long test_one_step_calls;
matrix<double,0,1> lr_schedule;
long lr_schedule_pos;
unsigned long gradient_check_budget;
std::exception_ptr eptr = nullptr;
mutable std::mutex eptr_mutex;
void propagate_exception() const
{
std::lock_guard<std::mutex> lock(eptr_mutex);
if (eptr)
std::rethrow_exception(eptr);
}
// The next 5 variables are not serialized
size_t main_iteration_counter;
size_t main_iteration_counter_at_last_disk_sync;
double prob_loss_increasing_thresh_default_value;
double prob_loss_increasing_thresh_max_value;
double prob_loss_increasing_thresh;
std::atomic<bool> updated_net_since_last_sync;
bool sync_file_reloaded;
unsigned long previous_loss_values_dump_amount;
unsigned long test_previous_loss_values_dump_amount;
};
// ----------------------------------------------------------------------------------------
template <
typename net_type,
typename solver_type
>
std::ostream& operator<< (
std::ostream& out,
dnn_trainer<net_type,solver_type>& trainer
)
{
using std::endl;
out << "dnn_trainer details: \n";
out << " net_type::num_layers: " << net_type::num_layers << endl;
// figure out how big the net is in MiB.
std::ostringstream sout;
net_type temp = trainer.get_net(); // make a copy so that we can clean it without mutating the trainer's net.
temp.clean();
serialize(temp, sout);
out << " net size: " << sout.str().size()/1024.0/1024.0 << " MiB" << endl;
// Don't include the loss params in the hash since we print them on the next line.
// They also aren't really part of the "architecture" of the network.
out << " net architecture hash: " << md5(cast_to_string(trainer.get_net().subnet())) << endl;
out << " loss: " << trainer.get_net().loss_details() << endl;
out << " get_train_one_step_calls(): " << trainer.get_train_one_step_calls() << endl;
out << " synchronization file: " << trainer.get_synchronization_file() << endl;
out << " trainer.get_solvers()[0]: " << trainer.get_solvers()[0] << endl;
out << " mini batch size: " << trainer.get_mini_batch_size() << endl;
auto sched = trainer.get_learning_rate_schedule();
if (sched.size() != 0)
{
out << " using explicit user-supplied learning rate schedule" << endl;
}
else
{
out << " learning rate: "<< trainer.get_learning_rate() << endl;
out << " learning rate shrink factor: "<< trainer.get_learning_rate_shrink_factor() << endl;
out << " min learning rate: "<< trainer.get_min_learning_rate() << endl;
out << " iterations without progress threshold: "<< trainer.get_iterations_without_progress_threshold() << endl;
out << " test iterations without progress threshold: "<< trainer.get_test_iterations_without_progress_threshold() << endl;
}
return out;
}
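// Usage sketch (illustrative): printing a configuration summary, e.g.
//     std::cout << trainer << std::endl;
// Note that this calls trainer.get_net(), so it blocks until any queued mini-batches
// have finished and the trainer state has been flushed to the synchronization file
// (if one is set).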
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_DNn_TRAINER_H_