"""Reward functions, distance functions, and reward managers."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from abc import ABCMeta
from abc import abstractmethod
from math import log


# All sequences here are assumed to be lists of ints bounded
# between 0 and `base`-1 (inclusive).


#################################
### Scalar Distance Functions ###
#################################


def abs_diff(a, b, base=0):
  """Absolute value of the difference between scalars.

  abs_diff is symmetric, i.e. `a` and `b` are interchangeable.

  Args:
    a: First argument. An int.
    b: Second argument. An int.
    base: Dummy argument so that the argument signature matches other scalar
        diff functions. abs_diff is the same in all bases.

  Returns:
    abs(a - b).
  """
  del base  # Unused.
  return abs(a - b)


def mod_abs_diff(a, b, base):
  """Shortest distance between `a` and `b` in the modular integers base `base`.

  The smallest distance between a and b is returned.
  Example: mod_abs_diff(1, 99, 100) ==> 2. It is not 98.

  mod_abs_diff is symmetric, i.e. `a` and `b` are interchangeable.

  Args:
    a: First argument. An int.
    b: Second argument. An int.
    base: The modulo base. A positive int.

  Returns:
    Shortest distance.
  """
  diff = abs(a - b)
  if diff >= base:
    diff %= base
  return min(diff, (-diff) + base)
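
# Illustrative check of `mod_abs_diff` (example values assumed, not part of the
# original module): in base 100 the values 1 and 99 are only 2 apart because
# distance wraps around 0, and the result is the same in either argument order.
#   mod_abs_diff(1, 99, 100)  ==> 2
#   mod_abs_diff(99, 1, 100)  ==> 2
#   mod_abs_diff(0, 50, 100)  ==> 50  (the largest possible distance in base 100)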


###############################
### List Distance Functions ###
###############################


def absolute_distance(pred, target, base, scalar_diff_fn=abs_diff):
  """Asymmetric list distance function.

  List distance is the sum of element-wise distances, like Hamming distance,
  but where `pred` can be longer or shorter than `target`. For each position in
  both `pred` and `target`, distance between those elements is computed with
  `scalar_diff_fn`. For missing or extra elements in `pred`, the maximum
  distance is assigned, which is equal to `base`.

  Distance is 0 when `pred` and `target` are identical, and will be a positive
  integer when they are not.

  Args:
    pred: Prediction list. Distance from this list is computed.
    target: Target list. Distance to this list is computed.
    base: The integer base to use. For example, a list of chars would use base
        256.
    scalar_diff_fn: Element-wise distance function.

  Returns:
    List distance between `pred` and `target`.
  """
  d = 0
  for i, target_t in enumerate(target):
    if i >= len(pred):
      d += base  # A missing slot is worth the max distance.
    else:
      # Add element-wise distance for this slot.
      d += scalar_diff_fn(pred[i], target_t, base)
  if len(pred) > len(target):
    # Each extra slot is worth the max distance.
    d += (len(pred) - len(target)) * base
  return d
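
# Worked example of `absolute_distance` (illustrative values assumed, not from
# the original module). With base 10 and the default element-wise `abs_diff`:
#   absolute_distance([1, 2], [1, 5, 5], 10)       ==> 0 + 3 + 10 = 13
#     (|1-1| + |2-5|, plus `base` for the missing third element)
#   absolute_distance([1, 5, 5, 9], [1, 5, 5], 10) ==> 10
#     (one extra element costs the maximum distance, `base`)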


def log_absolute_distance(pred, target, base):
  """Asymmetric list distance function that uses log distance.

  A list distance which computes the sum of element-wise distances, similar to
  `absolute_distance`. Unlike `absolute_distance`, this scales the resulting
  distance to be a float.

  Element-wise distances are log-scale. Distance between two lists changes
  relatively little for elements that are far apart, but changes a lot (goes to
  0 faster) when values get close together.

  Args:
    pred: List of ints. Computes distance from this list to the target.
    target: List of ints. This is the "correct" list which the prediction list
        is trying to match.
    base: Integer base.

  Returns:
    Float distance normalized so that when `pred` is at most as long as
    `target` the distance is between 0.0 and 1.0. Distance grows unboundedly
    large as `pred` grows past `target` in length.
  """
  if not target:
    length_normalizer = 1.0
    if not pred:
      # Distance between [] and [] is 0.0 since they are equal.
      return 0.0
  else:
    length_normalizer = float(len(target))
  # max_dist is the maximum element-wise distance, before taking log and
  # scaling. Since we use `mod_abs_diff`, it would be (base // 2), but we add
  # 1 to it so that missing or extra positions get the maximum penalty.
  max_dist = base // 2 + 1
  # The log-distance is scaled by this factor so that an element-wise distance
  # of 0 maps to 0.0 and the maximum distance maps to 1.0.
  # Note: +1 is added inside both logs to avoid log(0), i.e. the scaled
  # distance is log(dist + 1) / log(max_dist + 1).
  factor = log(max_dist + 1)
  d = 0.0  # Total distance to be computed.
  for i, target_t in enumerate(target):
    if i >= len(pred):
      # Assign the max element-wise distance for missing positions. This is 1.0
      # after scaling.
      d += 1.0
    else:
      # Add the log-dist divided by the scaling factor.
      d += log(mod_abs_diff(pred[i], target_t, base) + 1) / factor
  if len(pred) > len(target):
    # Add the max element-wise distance for each extra position.
    # Since max dist after scaling is 1, this is just the difference in list
    # lengths.
    d += (len(pred) - len(target))
  return d / length_normalizer  # Normalize by the target length.
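
# Worked example of `log_absolute_distance` (illustrative values assumed, not
# from the original module). With base 10, max_dist = 10 // 2 + 1 = 6 and
# factor = log(7):
#   log_absolute_distance([4], [5], 10)
#     ==> log(mod_abs_diff(4, 5, 10) + 1) / log(7) = log(2) / log(7) ~= 0.356
#   log_absolute_distance([], [5], 10)  ==> 1.0  (missing position)
#   log_absolute_distance([5], [5], 10) ==> 0.0  (exact match)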


########################
### Reward Functions ###
########################

# Reward functions assign reward based on program output.
# Warning: only use these functions as the terminal rewards in episodes, i.e.
# for the "final" programs.


def absolute_distance_reward(pred, target, base, scalar_diff_fn=abs_diff):
  """Reward function based on the `absolute_distance` function.

  Maximum reward, 1.0, is given when the lists are equal. Reward is scaled
  so that 0.0 reward is given when `pred` is the empty list (assuming `target`
  is not empty). Reward can go negative when `pred` is longer than `target`.

  This is an asymmetric reward function, so which list is the prediction and
  which is the target matters.

  Args:
    pred: Prediction sequence. This should be the sequence output by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.
    scalar_diff_fn: Element-wise distance function.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  unit_dist = float(base * len(target))
  if unit_dist == 0:
    unit_dist = base
  dist = absolute_distance(pred, target, base, scalar_diff_fn=scalar_diff_fn)
  return (unit_dist - dist) / unit_dist
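
# Worked example of `absolute_distance_reward` (illustrative values assumed,
# not from the original module). With base 10 and target [1, 2, 3],
# unit_dist = 10 * 3 = 30:
#   absolute_distance_reward([1, 2, 3], [1, 2, 3], 10) ==> (30 - 0) / 30 = 1.0
#   absolute_distance_reward([], [1, 2, 3], 10)        ==> (30 - 30) / 30 = 0.0
#   absolute_distance_reward([1, 2, 3, 0, 0], [1, 2, 3], 10)
#     ==> (30 - 20) / 30 = 1/3  (each extra element costs `base`)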


def absolute_mod_distance_reward(pred, target, base):
  """Same as `absolute_distance_reward` but uses the `mod_abs_diff` scalar diff.

  Args:
    pred: Prediction sequence. This should be the sequence output by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  return absolute_distance_reward(pred, target, base, mod_abs_diff)


def absolute_log_distance_reward(pred, target, base):
  """Compute reward using `log_absolute_distance`.

  Maximum reward, 1.0, is given when the lists are equal. Reward is scaled
  so that 0.0 reward is given when `pred` is the empty list (assuming `target`
  is not empty). Reward can go negative when `pred` is longer than `target`.

  This is an asymmetric reward function, so which list is the prediction and
  which is the target matters.

  This reward function has the nice property that much more reward is given
  for getting the correct value (at each position) than for there being any
  value at all. For example, in base 100, let's say pred = [1] * 1000
  and target = [10] * 1000. `absolute_distance` already gives a lot of reward
  for being about 80% accurate (the worst element-wise distance is 50, and the
  distances here are only 9). `log_absolute_distance`, on the other hand, gives
  greater and greater reward increments the closer each predicted value gets to
  the target. That makes the reward given for accuracy somewhat independent of
  the base.

  Args:
    pred: Prediction sequence. This should be the sequence output by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  return 1.0 - log_absolute_distance(pred, target, base)
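
# Illustrative comparison (example values assumed, not from the original
# module): in base 100 with target = [10] * 1000, shrinking the element-wise
# error from 9 to 1 barely moves the linear reward but moves the log reward a
# lot:
#   pred = [1] * 1000 (error 9 per element):
#     absolute_mod_distance_reward ==> 0.91
#     absolute_log_distance_reward ==> 1 - log(10)/log(52) ~= 0.42
#   pred = [9] * 1000 (error 1 per element):
#     absolute_mod_distance_reward ==> 0.99
#     absolute_log_distance_reward ==> 1 - log(2)/log(52)  ~= 0.82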


#######################
### Reward Managers ###
#######################

# Reward managers assign reward to many code attempts throughout an episode.


class RewardManager(object):
  """Reward managers administer reward across an episode.

  Reward managers are used for "editor" environments. These are environments
  where the agent has some way to edit its code over time, and can run its code
  many times in the same episode, so that it can make incremental improvements.

  Reward managers are instantiated with a target sequence, which is the known
  correct program output. The manager is called on the output of a proposed
  program and returns a reward. If many proposal outputs are tried, reward may
  be some stateful function that takes previous tries into account. This is
  done, in part, so that an agent cannot accumulate unbounded reward just by
  trying junk programs as often as possible. So reward managers should not give
  the same reward twice if the next proposal is not better than the last.
  """
  __metaclass__ = ABCMeta

  def __init__(self, target, base, distance_fn=absolute_distance):
    self._target = list(target)
    self._base = base
    self._distance_fn = distance_fn

  @abstractmethod
  def __call__(self, sequence):
    """Call this reward manager like a function to get reward.

    Calls to the reward manager are stateful, and will take previous sequences
    into account. Repeated calls with the same sequence may produce different
    rewards.

    Args:
      sequence: List of integers (each between 0 and base - 1). This is the
          proposal sequence. Reward will be computed based on the distance
          from this sequence to the target (distance function and target are
          given in the constructor), as well as previous sequences tried during
          the lifetime of this object.

    Returns:
      Float value. The reward received from this call.
    """
    return 0.0


class DeltaRewardManager(RewardManager):
  """Simple reward manager that assigns reward for the net change in distance.

  Given some (possibly asymmetric) list distance function, gives reward for
  relative changes in the prediction's distance to the target.

  For example, if on the first call the distance is 3.0, the change in distance
  is -3 (from a starting distance of 0). That relative change is scaled to
  produce a negative reward for this step. If on the next call the distance is
  2.0, that is a +1 change, which is scaled to give a positive reward. If the
  final call has distance 0 (the target is achieved), that is another positive
  change of +2. The total reward across all 3 calls is then 0, which is the
  highest possible episode total.

  Reward is scaled so that the maximum element-wise distance is worth 1.0.
  Maximum total episode reward attainable is 0.
  """

  def __init__(self, target, base, distance_fn=absolute_distance):
    super(DeltaRewardManager, self).__init__(target, base, distance_fn)
    self._last_diff = 0

  def _diff(self, seq):
    return self._distance_fn(seq, self._target, self._base)

  def _delta_reward(self, seq):
    # Reward is relative to the previous sequence's diff.
    # Reward is scaled so that the maximum token difference is worth 1.0.
    # Reward = (last_diff - this_diff) / self._base.
    # Reward is positive if this sequence is closer to the target than the
    # previous sequence, and negative if this sequence is further away.
    diff = self._diff(seq)
    reward = (self._last_diff - diff) / float(self._base)
    self._last_diff = diff
    return reward

  def __call__(self, seq):
    return self._delta_reward(seq)
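
# Illustrative usage of DeltaRewardManager (example values assumed, not from
# the original module). With base 10 and target [3, 3], each call is rewarded
# by the scaled change in distance since the previous call:
#   manager = DeltaRewardManager([3, 3], base=10)
#   manager([])      ==> (0 - 20) / 10 = -2.0  (starting distance is 0)
#   manager([3])     ==> (20 - 10) / 10 = 1.0
#   manager([3, 3])  ==> (10 - 0) / 10 = 1.0   (episode total is 0.0)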


class FloorRewardManager(RewardManager):
  """Assigns positive reward for each step taken closer to the target.

  Given some (possibly asymmetric) list distance function, gives reward
  whenever a new episode minimum distance is reached. No reward is given if
  the distance regresses to a higher value, so that the sum of rewards
  for the episode is non-negative.

  Reward is scaled so that the maximum element-wise distance is worth 1.0.
  Maximum total episode reward attainable is len(target).

  If the prediction sequence is longer than the target, a reward of -1 is
  given. Subsequent predictions which are also longer get 0 reward. The -1
  penalty will be canceled out with a +1 reward when a prediction is given
  which is at most the length of the target.
  """

  def __init__(self, target, base, distance_fn=absolute_distance):
    super(FloorRewardManager, self).__init__(target, base, distance_fn)
    self._last_diff = 0
    self._min_diff = self._max_diff()
    self._too_long_penalty_given = False

  def _max_diff(self):
    return self._distance_fn([], self._target, self._base)

  def _diff(self, seq):
    return self._distance_fn(seq, self._target, self._base)

  def _delta_reward(self, seq):
    # Reward is only given if this sequence is closer to the target than any
    # previous sequence.
    # Reward is scaled so that the maximum token difference is worth 1.0.
    # Reward = (min_diff - this_diff) / self._base.
    # Reward is always non-negative.
    diff = self._diff(seq)
    if diff < self._min_diff:
      reward = (self._min_diff - diff) / float(self._base)
      self._min_diff = diff
    else:
      reward = 0.0
    return reward

  def __call__(self, seq):
    if len(seq) > len(self._target):  # Output is too long.
      if not self._too_long_penalty_given:
        self._too_long_penalty_given = True
        reward = -1.0
      else:
        reward = 0.0  # Don't give this penalty more than once.
      return reward

    reward = self._delta_reward(seq)
    if self._too_long_penalty_given:
      reward += 1.0  # Give back the -1 penalty from earlier.
      self._too_long_penalty_given = False
    return reward
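
# Illustrative usage of FloorRewardManager (example values assumed, not from
# the original module). With base 10 and target [3, 3], reward is only paid out
# when a new minimum distance is reached, and a one-time -1 penalty applies
# while the prediction is longer than the target:
#   manager = FloorRewardManager([3, 3], base=10)
#   manager([3])        ==> (20 - 10) / 10 = 1.0  (new minimum distance)
#   manager([9])        ==> 0.0   (distance 16 is not a new minimum)
#   manager([3, 3, 3])  ==> -1.0  (too long; penalty given once)
#   manager([3, 3])     ==> 1.0 + 1.0 = 2.0  (penalty returned; total is 2.0)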