"""Reward functions, distance functions, and reward managers."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from abc import ABCMeta
from abc import abstractmethod
from math import log


# All sequences here are assumed to be lists of ints bounded
# between 0 and `base`-1 (inclusive).


#################################
### Scalar Distance Functions ###
#################################


def abs_diff(a, b, base=0):
  """Absolute value of the difference between scalars.

  abs_diff is symmetric, i.e. `a` and `b` are interchangeable.

  Args:
    a: First argument. An int.
    b: Second argument. An int.
    base: Dummy argument so that the argument signature matches other scalar
        diff functions. abs_diff is the same in all bases.

  Returns:
    abs(a - b).
  """
  del base  # Unused.
  return abs(a - b)


def mod_abs_diff(a, b, base):
  """Shortest distance between `a` and `b` in the modular integers base `base`.

  The smallest distance between a and b is returned.
  Example: mod_abs_diff(1, 99, 100) ==> 2. It is not 98.

  mod_abs_diff is symmetric, i.e. `a` and `b` are interchangeable.

  Args:
    a: First argument. An int.
    b: Second argument. An int.
    base: The modulo base. A positive int.

  Returns:
    Shortest distance.
  """
  diff = abs(a - b)
  if diff >= base:
    diff %= base
  return min(diff, (-diff) + base)
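
# Illustrative check of `mod_abs_diff` (example values assumed, not part of the
# original module): in base 100 the values 1 and 99 are only 2 apart because
# distance wraps around 0, and the result is the same in either argument order.
#   mod_abs_diff(1, 99, 100)  ==> 2
#   mod_abs_diff(99, 1, 100)  ==> 2
#   mod_abs_diff(0, 50, 100)  ==> 50  (the largest possible distance in base 100)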


###############################
### List Distance Functions ###
###############################


def absolute_distance(pred, target, base, scalar_diff_fn=abs_diff):
  """Asymmetric list distance function.

  List distance is the sum of element-wise distances, like Hamming distance,
  but where `pred` can be longer or shorter than `target`. For each position in
  both `pred` and `target`, distance between those elements is computed with
  `scalar_diff_fn`. For missing or extra elements in `pred`, the maximum
  distance is assigned, which is equal to `base`.

  Distance is 0 when `pred` and `target` are identical, and will be a positive
  integer when they are not.

  Args:
    pred: Prediction list. Distance from this list is computed.
    target: Target list. Distance to this list is computed.
    base: The integer base to use. For example, a list of chars would use base
        256.
    scalar_diff_fn: Element-wise distance function.

  Returns:
    List distance between `pred` and `target`.
  """
  d = 0
  for i, target_t in enumerate(target):
    if i >= len(pred):
      d += base  # A missing slot is worth the max distance.
    else:
      # Add element-wise distance for this slot.
      d += scalar_diff_fn(pred[i], target_t, base)
  if len(pred) > len(target):
    # Each extra slot is worth the max distance.
    d += (len(pred) - len(target)) * base
  return d
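
# Worked example of `absolute_distance` (illustrative values assumed, not from
# the original module). With base 10 and the default element-wise `abs_diff`:
#   absolute_distance([1, 2], [1, 5, 5], 10)       ==> 0 + 3 + 10 = 13
#     (|1-1| + |2-5|, plus `base` for the missing third element)
#   absolute_distance([1, 5, 5, 9], [1, 5, 5], 10) ==> 10
#     (one extra element costs the maximum distance, `base`)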


def log_absolute_distance(pred, target, base):
  """Asymmetric list distance function that uses log distance.

  A list distance which computes the sum of element-wise distances, similar to
  `absolute_distance`. Unlike `absolute_distance`, this scales the resulting
  distance to be a float.

  Element-wise distances are log-scale. Distance between two lists changes
  relatively little for elements that are far apart, but changes a lot (goes to
  0 faster) when values get close together.

  Args:
    pred: List of ints. Computes distance from this list to the target.
    target: List of ints. This is the "correct" list which the prediction list
        is trying to match.
    base: Integer base.

  Returns:
    Float distance normalized so that when `pred` is at most as long as
    `target` the distance is between 0.0 and 1.0. Distance grows unboundedly
    large as `pred` grows past `target` in length.
  """
  if not target:
    length_normalizer = 1.0
    if not pred:
      # Distance between [] and [] is 0.0 since they are equal.
      return 0.0
  else:
    length_normalizer = float(len(target))
  # max_dist is the maximum element-wise distance, before taking log and
  # scaling. Since we use `mod_abs_diff`, it would be (base // 2), but we add
  # 1 to it so that missing or extra positions get the maximum penalty.
  max_dist = base // 2 + 1
  # The log-distance is scaled by this factor so that an element-wise distance
  # of 0 maps to 0.0 and the maximum distance maps to 1.0.
  # Note: +1 is added inside both logs to avoid log(0), i.e. the scaled
  # distance is log(dist + 1) / log(max_dist + 1).
  factor = log(max_dist + 1)
  d = 0.0  # Total distance to be computed.
  for i, target_t in enumerate(target):
    if i >= len(pred):
      # Assign the max element-wise distance for missing positions. This is 1.0
      # after scaling.
      d += 1.0
    else:
      # Add the log-dist divided by the scaling factor.
      d += log(mod_abs_diff(pred[i], target_t, base) + 1) / factor
  if len(pred) > len(target):
    # Add the max element-wise distance for each extra position.
    # Since max dist after scaling is 1, this is just the difference in list
    # lengths.
    d += (len(pred) - len(target))
  return d / length_normalizer  # Normalize by the target length.
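
# Worked example of `log_absolute_distance` (illustrative values assumed, not
# from the original module). With base 10, max_dist = 10 // 2 + 1 = 6 and
# factor = log(7):
#   log_absolute_distance([4], [5], 10)
#     ==> log(mod_abs_diff(4, 5, 10) + 1) / log(7) = log(2) / log(7) ~= 0.356
#   log_absolute_distance([], [5], 10)  ==> 1.0  (missing position)
#   log_absolute_distance([5], [5], 10) ==> 0.0  (exact match)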


########################
### Reward Functions ###
########################

# Reward functions assign reward based on program output.
# Warning: only use these functions as the terminal rewards in episodes, i.e.
# for the "final" programs.


def absolute_distance_reward(pred, target, base, scalar_diff_fn=abs_diff):
  """Reward function based on the `absolute_distance` function.

  Maximum reward, 1.0, is given when the lists are equal. Reward is scaled
  so that 0.0 reward is given when `pred` is the empty list (assuming `target`
  is not empty). Reward can go negative when `pred` is longer than `target`.

  This is an asymmetric reward function, so which list is the prediction and
  which is the target matters.

  Args:
    pred: Prediction sequence. This should be the sequence output by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.
    scalar_diff_fn: Element-wise distance function.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  unit_dist = float(base * len(target))
  if unit_dist == 0:
    unit_dist = base
  dist = absolute_distance(pred, target, base, scalar_diff_fn=scalar_diff_fn)
  return (unit_dist - dist) / unit_dist
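
# Worked example of `absolute_distance_reward` (illustrative values assumed,
# not from the original module). With base 10 and target [1, 2, 3],
# unit_dist = 10 * 3 = 30:
#   absolute_distance_reward([1, 2, 3], [1, 2, 3], 10) ==> (30 - 0) / 30 = 1.0
#   absolute_distance_reward([], [1, 2, 3], 10)        ==> (30 - 30) / 30 = 0.0
#   absolute_distance_reward([1, 2, 3, 0, 0], [1, 2, 3], 10)
#     ==> (30 - 20) / 30 = 1/3  (each extra element costs `base`)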


def absolute_mod_distance_reward(pred, target, base):
  """Same as `absolute_distance_reward` but uses the `mod_abs_diff` scalar diff.

  Args:
    pred: Prediction sequence. This should be the sequence output by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  return absolute_distance_reward(pred, target, base, mod_abs_diff)


def absolute_log_distance_reward(pred, target, base):
  """Compute reward using `log_absolute_distance`.

  Maximum reward, 1.0, is given when the lists are equal. Reward is scaled
  so that 0.0 reward is given when `pred` is the empty list (assuming `target`
  is not empty). Reward can go negative when `pred` is longer than `target`.

  This is an asymmetric reward function, so which list is the prediction and
  which is the target matters.

  This reward function has the nice property that much more reward is given
  for getting the correct value (at each position) than for there being any
  value at all. For example, in base 100, let's say pred = [1] * 1000
  and target = [10] * 1000. `absolute_distance` already gives a lot of reward
  for being about 80% accurate (the worst element-wise distance is 50, and the
  distances here are only 9). `log_absolute_distance`, on the other hand, gives
  greater and greater reward increments the closer each predicted value gets to
  the target. That makes the reward given for accuracy somewhat independent of
  the base.

  Args:
    pred: Prediction sequence. This should be the sequence output by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  return 1.0 - log_absolute_distance(pred, target, base)
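
# Illustrative comparison (example values assumed, not from the original
# module): in base 100 with target = [10] * 1000, shrinking the element-wise
# error from 9 to 1 barely moves the linear reward but moves the log reward a
# lot:
#   pred = [1] * 1000 (error 9 per element):
#     absolute_mod_distance_reward ==> 0.91
#     absolute_log_distance_reward ==> 1 - log(10)/log(52) ~= 0.42
#   pred = [9] * 1000 (error 1 per element):
#     absolute_mod_distance_reward ==> 0.99
#     absolute_log_distance_reward ==> 1 - log(2)/log(52)  ~= 0.82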


#######################
### Reward Managers ###
#######################

# Reward managers assign reward to many code attempts throughout an episode.


class RewardManager(object):
  """Reward managers administer reward across an episode.

  Reward managers are used for "editor" environments. These are environments
  where the agent has some way to edit its code over time, and can run its code
  many times in the same episode, so that it can make incremental improvements.

  Reward managers are instantiated with a target sequence, which is the known
  correct program output. The manager is called on the output of a proposed
  program and returns a reward. If many proposal outputs are tried, reward may
  be some stateful function that takes previous tries into account. This is
  done, in part, so that an agent cannot accumulate unbounded reward just by
  trying junk programs as often as possible. So reward managers should not give
  the same reward twice if the next proposal is not better than the last.
  """
  __metaclass__ = ABCMeta

  def __init__(self, target, base, distance_fn=absolute_distance):
    self._target = list(target)
    self._base = base
    self._distance_fn = distance_fn

  @abstractmethod
  def __call__(self, sequence):
    """Call this reward manager like a function to get reward.

    Calls to the reward manager are stateful, and will take previous sequences
    into account. Repeated calls with the same sequence may produce different
    rewards.

    Args:
      sequence: List of integers (each between 0 and base - 1). This is the
          proposal sequence. Reward will be computed based on the distance
          from this sequence to the target (distance function and target are
          given in the constructor), as well as previous sequences tried during
          the lifetime of this object.

    Returns:
      Float value. The reward received from this call.
    """
    return 0.0


class DeltaRewardManager(RewardManager):
  """Simple reward manager that assigns reward for the net change in distance.

  Given some (possibly asymmetric) list distance function, gives reward for
  relative changes in the prediction's distance to the target.

  For example, if on the first call the distance is 3.0, the change in distance
  is -3 (from a starting distance of 0). That relative change is scaled to
  produce a negative reward for this step. If on the next call the distance is
  2.0, that is a +1 change, which is scaled to give a positive reward. If the
  final call has distance 0 (the target is achieved), that is another positive
  change of +2. The total reward across all 3 calls is then 0, which is the
  highest possible episode total.

  Reward is scaled so that the maximum element-wise distance is worth 1.0.
  Maximum total episode reward attainable is 0.
  """

  def __init__(self, target, base, distance_fn=absolute_distance):
    super(DeltaRewardManager, self).__init__(target, base, distance_fn)
    self._last_diff = 0

  def _diff(self, seq):
    return self._distance_fn(seq, self._target, self._base)

  def _delta_reward(self, seq):
    # Reward is relative to the previous sequence's diff.
    # Reward is scaled so that the maximum token difference is worth 1.0.
    # Reward = (last_diff - this_diff) / self._base.
    # Reward is positive if this sequence is closer to the target than the
    # previous sequence, and negative if this sequence is further away.
    diff = self._diff(seq)
    reward = (self._last_diff - diff) / float(self._base)
    self._last_diff = diff
    return reward

  def __call__(self, seq):
    return self._delta_reward(seq)
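
# Illustrative usage of DeltaRewardManager (example values assumed, not from
# the original module). With base 10 and target [3, 3], each call is rewarded
# by the scaled change in distance since the previous call:
#   manager = DeltaRewardManager([3, 3], base=10)
#   manager([])      ==> (0 - 20) / 10 = -2.0  (starting distance is 0)
#   manager([3])     ==> (20 - 10) / 10 = 1.0
#   manager([3, 3])  ==> (10 - 0) / 10 = 1.0   (episode total is 0.0)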


class FloorRewardManager(RewardManager):
  """Assigns positive reward for each step taken closer to the target.

  Given some (possibly asymmetric) list distance function, gives reward
  whenever a new episode minimum distance is reached. No reward is given if
  the distance regresses to a higher value, so that the sum of rewards
  for the episode is non-negative.

  Reward is scaled so that the maximum element-wise distance is worth 1.0.
  Maximum total episode reward attainable is len(target).

  If the prediction sequence is longer than the target, a reward of -1 is
  given. Subsequent predictions which are also longer get 0 reward. The -1
  penalty will be canceled out with a +1 reward when a prediction is given
  which is at most the length of the target.
  """

  def __init__(self, target, base, distance_fn=absolute_distance):
    super(FloorRewardManager, self).__init__(target, base, distance_fn)
    self._last_diff = 0
    self._min_diff = self._max_diff()
    self._too_long_penalty_given = False

  def _max_diff(self):
    return self._distance_fn([], self._target, self._base)

  def _diff(self, seq):
    return self._distance_fn(seq, self._target, self._base)

  def _delta_reward(self, seq):
    # Reward is only given if this sequence is closer to the target than any
    # previous sequence.
    # Reward is scaled so that the maximum token difference is worth 1.0.
    # Reward = (min_diff - this_diff) / self._base.
    # Reward is always non-negative.
    diff = self._diff(seq)
    if diff < self._min_diff:
      reward = (self._min_diff - diff) / float(self._base)
      self._min_diff = diff
    else:
      reward = 0.0
    return reward

  def __call__(self, seq):
    if len(seq) > len(self._target):  # Output is too long.
      if not self._too_long_penalty_given:
        self._too_long_penalty_given = True
        reward = -1.0
      else:
        reward = 0.0  # Don't give this penalty more than once.
      return reward

    reward = self._delta_reward(seq)
    if self._too_long_penalty_given:
      reward += 1.0  # Give back the -1 penalty from earlier.
      self._too_long_penalty_given = False
    return reward
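
# Illustrative usage of FloorRewardManager (example values assumed, not from
# the original module). With base 10 and target [3, 3], reward is only paid out
# when a new minimum distance is reached, and a one-time -1 penalty applies
# while the prediction is longer than the target:
#   manager = FloorRewardManager([3, 3], base=10)
#   manager([3])        ==> (20 - 10) / 10 = 1.0  (new minimum distance)
#   manager([9])        ==> 0.0   (distance 16 is not a new minimum)
#   manager([3, 3, 3])  ==> -1.0  (too long; penalty given once)
#   manager([3, 3])     ==> 1.0 + 1.0 = 2.0  (penalty returned; total is 2.0)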