# Copyright 2019 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""Quadruped Domain."""

import collections

from dm_control.suite import quadruped
from dm_control import mujoco
from dm_control.mujoco.wrapper import mjbindings
from dm_control.rl import control
from dm_control.suite import base
from dm_control.suite import common
from dm_control.utils import containers
from dm_control.utils import rewards
from dm_control.utils import xml_tools
from dm_control.utils import io as resources
from lxml import etree
import numpy as np
from scipy import ndimage
import os 

enums = mjbindings.enums
mjlib = mjbindings.mjlib


_DEFAULT_TIME_LIMIT = 20
_CONTROL_TIMESTEP = .02

# Horizontal speeds above which the move reward is 1.
_RUN_SPEED = 5
_WALK_SPEED = 0.5

_JUMP_HEIGHT = 1.0 # -also good for foot up
_LIE_DOWN_HEIGHT = 0.2
_FOOT_DOWN_HEIGHT = 0.2
_FOOT_UP_HEIGHT = 0.8

# Constants related to terrain generation.
_HEIGHTFIELD_ID = 0
_TERRAIN_SMOOTHNESS = 0.15  # 0.0: maximally bumpy; 1.0: completely smooth.
_TERRAIN_BUMP_SCALE = 2  # Spatial scale of terrain bumps (in meters).

# Named model elements.
_TOES = ['toe_front_left', 'toe_back_left', 'toe_back_right', 'toe_front_right']
_WALLS = ['wall_px', 'wall_py', 'wall_nx', 'wall_ny']

def make(task,
         task_kwargs=None,
         environment_kwargs=None,
         visualize_reward=False):
    task_kwargs = task_kwargs or {}
    if environment_kwargs is not None:
        task_kwargs = task_kwargs.copy()
        task_kwargs['environment_kwargs'] = environment_kwargs
    env = SUITE[task](**task_kwargs)
    env.task.visualize_reward = visualize_reward
    return env

def get_model_and_assets():
    """Returns a tuple containing the model XML string and a dict of assets."""
    root_dir = os.path.dirname(os.path.dirname(__file__))
    xml = resources.GetResource(
        os.path.join(root_dir, 'custom_dmc_tasks', 'quadruped.xml'))
    return xml, common.ASSETS


def make_model(floor_size=None, terrain=False, rangefinders=False,
               walls_and_ball=False):
  """Returns the model XML string."""
  root_dir = os.path.dirname(os.path.dirname(__file__))
  xml_string = common.read_model(os.path.join(root_dir, 'custom_dmc_tasks', 'quadruped.xml'))
  parser = etree.XMLParser(remove_blank_text=True)
  mjcf = etree.XML(xml_string, parser)

  # Set floor size.
  if floor_size is not None:
    floor_geom = mjcf.find('.//geom[@name=\'floor\']')
    floor_geom.attrib['size'] = f'{floor_size} {floor_size} .5'

  # Remove walls, ball and target.
  if not walls_and_ball:
    for wall in _WALLS:
      wall_geom = xml_tools.find_element(mjcf, 'geom', wall)
      wall_geom.getparent().remove(wall_geom)

    # Remove ball.
    ball_body = xml_tools.find_element(mjcf, 'body', 'ball')
    ball_body.getparent().remove(ball_body)

    # Remove target.
    target_site = xml_tools.find_element(mjcf, 'site', 'target')
    target_site.getparent().remove(target_site)

  # Remove terrain.
  if not terrain:
    terrain_geom = xml_tools.find_element(mjcf, 'geom', 'terrain')
    terrain_geom.getparent().remove(terrain_geom)

  # Remove rangefinders if they're not used, as range computations can be
  # expensive, especially in a scene with heightfields.
  if not rangefinders:
    rangefinder_sensors = mjcf.findall('.//rangefinder')
    for rf in rangefinder_sensors:
      rf.getparent().remove(rf)

  return etree.tostring(mjcf, pretty_print=True)


@quadruped.SUITE.add('custom')
def lie_down(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
  """Returns the Walk task."""
  xml_string = make_model(floor_size=_DEFAULT_TIME_LIMIT * _WALK_SPEED)
  physics = Physics.from_xml_string(xml_string, common.ASSETS)
  task = Stand(goal='lie_down', random=random)
  environment_kwargs = environment_kwargs or {}
  return control.Environment(physics, task, time_limit=time_limit,
                             control_timestep=_CONTROL_TIMESTEP,
                             **environment_kwargs)


@quadruped.SUITE.add('custom')
def two_legs(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
  """Returns the Walk task."""
  xml_string = make_model(floor_size=_DEFAULT_TIME_LIMIT * _WALK_SPEED)
  physics = Physics.from_xml_string(xml_string, common.ASSETS)
  task = Stand(goal='two_legs', random=random)
  environment_kwargs = environment_kwargs or {}
  return control.Environment(physics, task, time_limit=time_limit,
                             control_timestep=_CONTROL_TIMESTEP,
                             **environment_kwargs)


@quadruped.SUITE.add('custom')
def stand(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
  """Returns the Walk task."""
  xml_string = make_model(floor_size=_DEFAULT_TIME_LIMIT * _WALK_SPEED)
  physics = Physics.from_xml_string(xml_string, common.ASSETS)
  task = Stand(goal='stand', random=random)
  environment_kwargs = environment_kwargs or {}
  return control.Environment(physics, task, time_limit=time_limit,
                             control_timestep=_CONTROL_TIMESTEP,
                             **environment_kwargs)

@quadruped.SUITE.add('custom')
def jump(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
  """Returns the Walk task."""
  xml_string = make_model(floor_size=_DEFAULT_TIME_LIMIT * _WALK_SPEED)
  physics = Physics.from_xml_string(xml_string, common.ASSETS)
  task = Jump(desired_height=_JUMP_HEIGHT, random=random)
  environment_kwargs = environment_kwargs or {}
  return control.Environment(physics, task, time_limit=time_limit,
                             control_timestep=_CONTROL_TIMESTEP,
                             **environment_kwargs)

@quadruped.SUITE.add('custom')
def roll(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
  """Returns the Walk task."""
  xml_string = make_model(floor_size=_DEFAULT_TIME_LIMIT * _WALK_SPEED)
  physics = Physics.from_xml_string(xml_string, common.ASSETS)
  task = Roll(desired_speed=_WALK_SPEED, random=random)
  environment_kwargs = environment_kwargs or {}
  return control.Environment(physics, task, time_limit=time_limit,
                             control_timestep=_CONTROL_TIMESTEP,
                             **environment_kwargs)

@quadruped.SUITE.add('custom')
def roll_fast(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
  """Returns the Walk task."""
  xml_string = make_model(floor_size=_DEFAULT_TIME_LIMIT * _WALK_SPEED)
  physics = Physics.from_xml_string(xml_string, common.ASSETS)
  task = Roll(desired_speed=_RUN_SPEED, random=random)
  environment_kwargs = environment_kwargs or {}
  return control.Environment(physics, task, time_limit=time_limit,
                             control_timestep=_CONTROL_TIMESTEP,
                             **environment_kwargs)

class Physics(mujoco.Physics):
  """Physics simulation with additional features for the Quadruped domain."""

  def _reload_from_data(self, data):
    super()._reload_from_data(data)
    # Clear cached sensor names when the physics is reloaded.
    self._sensor_types_to_names = {}
    self._hinge_names = []

  def _get_sensor_names(self, *sensor_types):
    try:
      sensor_names = self._sensor_types_to_names[sensor_types]
    except KeyError:
      [sensor_ids] = np.where(np.in1d(self.model.sensor_type, sensor_types))
      sensor_names = [self.model.id2name(s_id, 'sensor') for s_id in sensor_ids]
      self._sensor_types_to_names[sensor_types] = sensor_names
    return sensor_names

  def torso_upright(self):
    """Returns the dot-product of the torso z-axis and the global z-axis."""
    return np.asarray(self.named.data.xmat['torso', 'zz'])

  def torso_velocity(self):
    """Returns the velocity of the torso, in the local frame."""
    return self.named.data.sensordata['velocimeter'].copy()
  
  def com_height(self):
    return self.named.data.sensordata['center_of_mass'].copy()[2]

  def egocentric_state(self):
    """Returns the state without global orientation or position."""
    if not self._hinge_names:
      [hinge_ids] = np.nonzero(self.model.jnt_type ==
                               enums.mjtJoint.mjJNT_HINGE)
      self._hinge_names = [self.model.id2name(j_id, 'joint')
                           for j_id in hinge_ids]
    return np.hstack((self.named.data.qpos[self._hinge_names],
                      self.named.data.qvel[self._hinge_names],
                      self.data.act))

  def toe_positions(self):
    """Returns toe positions in egocentric frame."""
    torso_frame = self.named.data.xmat['torso'].reshape(3, 3)
    torso_pos = self.named.data.xpos['torso']
    torso_to_toe = self.named.data.xpos[_TOES] - torso_pos
    return torso_to_toe.dot(torso_frame)

  def force_torque(self):
    """Returns scaled force/torque sensor readings at the toes."""
    force_torque_sensors = self._get_sensor_names(enums.mjtSensor.mjSENS_FORCE,
                                                  enums.mjtSensor.mjSENS_TORQUE)
    return np.arcsinh(self.named.data.sensordata[force_torque_sensors])

  def imu(self):
    """Returns IMU-like sensor readings."""
    imu_sensors = self._get_sensor_names(enums.mjtSensor.mjSENS_GYRO,
                                         enums.mjtSensor.mjSENS_ACCELEROMETER)
    return self.named.data.sensordata[imu_sensors]

  def rangefinder(self):
    """Returns scaled rangefinder sensor readings."""
    rf_sensors = self._get_sensor_names(enums.mjtSensor.mjSENS_RANGEFINDER)
    rf_readings = self.named.data.sensordata[rf_sensors]
    no_intersection = -1.0
    return np.where(rf_readings == no_intersection, 1.0, np.tanh(rf_readings))

  def origin_distance(self):
    """Returns the distance from the origin to the workspace."""
    return np.asarray(np.linalg.norm(self.named.data.site_xpos['workspace']))

  def origin(self):
    """Returns origin position in the torso frame."""
    torso_frame = self.named.data.xmat['torso'].reshape(3, 3)
    torso_pos = self.named.data.xpos['torso']
    return -torso_pos.dot(torso_frame)

  def ball_state(self):
    """Returns ball position and velocity relative to the torso frame."""
    data = self.named.data
    torso_frame = data.xmat['torso'].reshape(3, 3)
    ball_rel_pos = data.xpos['ball'] - data.xpos['torso']
    ball_rel_vel = data.qvel['ball_root'][:3] - data.qvel['root'][:3]
    ball_rot_vel = data.qvel['ball_root'][3:]
    ball_state = np.vstack((ball_rel_pos, ball_rel_vel, ball_rot_vel))
    return ball_state.dot(torso_frame).ravel()

  def target_position(self):
    """Returns target position in torso frame."""
    torso_frame = self.named.data.xmat['torso'].reshape(3, 3)
    torso_pos = self.named.data.xpos['torso']
    torso_to_target = self.named.data.site_xpos['target'] - torso_pos
    return torso_to_target.dot(torso_frame)

  def ball_to_target_distance(self):
    """Returns horizontal distance from the ball to the target."""
    ball_to_target = (self.named.data.site_xpos['target'] -
                      self.named.data.xpos['ball'])
    return np.linalg.norm(ball_to_target[:2])

  def self_to_ball_distance(self):
    """Returns horizontal distance from the quadruped workspace to the ball."""
    self_to_ball = (self.named.data.site_xpos['workspace']
                    -self.named.data.xpos['ball'])
    return np.linalg.norm(self_to_ball[:2])


def _find_non_contacting_height(physics, orientation, x_pos=0.0, y_pos=0.0):
  """Find a height with no contacts given a body orientation.
  Args:
    physics: An instance of `Physics`.
    orientation: A quaternion.
    x_pos: A float. Position along global x-axis.
    y_pos: A float. Position along global y-axis.
  Raises:
    RuntimeError: If a non-contacting configuration has not been found after
    10,000 attempts.
  """
  z_pos = 0.0  # Start embedded in the floor.
  num_contacts = 1
  num_attempts = 0
  # Move up in 1cm increments until no contacts.
  while num_contacts > 0:
    try:
      with physics.reset_context():
        physics.named.data.qpos['root'][:3] = x_pos, y_pos, z_pos
        physics.named.data.qpos['root'][3:] = orientation
    except control.PhysicsError:
      # We may encounter a PhysicsError here due to filling the contact
      # buffer, in which case we simply increment the height and continue.
      pass
    num_contacts = physics.data.ncon
    z_pos += 0.01
    num_attempts += 1
    if num_attempts > 10000:
      raise RuntimeError('Failed to find a non-contacting configuration.')


def _common_observations(physics):
  """Returns the observations common to all tasks."""
  obs = collections.OrderedDict()
  obs['egocentric_state'] = physics.egocentric_state()
  obs['torso_velocity'] = physics.torso_velocity()
  obs['torso_upright'] = physics.torso_upright()
  obs['imu'] = physics.imu()
  obs['force_torque'] = physics.force_torque()
  return obs

def _lie_down_reward(physics, deviation_angle=0):
  """Returns a reward proportional to how upright the torso is.
  Args:
    physics: an instance of `Physics`.
    deviation_angle: A float, in degrees. The reward is 0 when the torso is
      exactly upside-down and 1 when the torso's z-axis is less than
      `deviation_angle` away from the global z-axis.
  """
  torso = physics.named.data.xpos['torso', 'z']
  return rewards.tolerance(
                  torso,
                  bounds=(-float('inf'), _LIE_DOWN_HEIGHT),
                  margin=_LIE_DOWN_HEIGHT * 1.5)


def _two_legs_reward(physics, deviation_angle=0):
  """Returns a reward proportional to how upright the torso is.
  Args:
    physics: an instance of `Physics`.
    deviation_angle: A float, in degrees. The reward is 0 when the torso is
      exactly upside-down and 1 when the torso's z-axis is less than
      `deviation_angle` away from the global z-axis.
  """
  toes = []
  for t in ['toe_front_left', 'toe_front_right', 'toe_back_left', 'toe_back_right']:
    toe = physics.named.data.xpos[t, 'z']
    toes.append(toe)
  toes = sorted(toes)
  min_toes = sum(toes[:2]) / 2
  max_toes = sum(toes[2:]) / 2
  toes_up = rewards.tolerance(
                  max_toes,
                  bounds=(_FOOT_UP_HEIGHT, float('inf')),
                  margin=_FOOT_UP_HEIGHT // 2)
  toes_down = rewards.tolerance(
                                min_toes,
                                bounds=(-float('inf'), _FOOT_DOWN_HEIGHT),
                                margin=_FOOT_DOWN_HEIGHT * 1.5)
  return toes_down * toes_up


def _upright_reward(physics, deviation_angle=0):
  """Returns a reward proportional to how upright the torso is.
  Args:
    physics: an instance of `Physics`.
    deviation_angle: A float, in degrees. The reward is 0 when the torso is
      exactly upside-down and 1 when the torso's z-axis is less than
      `deviation_angle` away from the global z-axis.
  """
  deviation = np.cos(np.deg2rad(deviation_angle))
  return rewards.tolerance(
      physics.torso_upright(),
      bounds=(deviation, float('inf')),
      sigmoid='linear',
      margin=1 + deviation,
      value_at_margin=0)


class Move(base.Task):
  """A quadruped task solved by moving forward at a designated speed."""

  def __init__(self, desired_speed, random=None):
    """Initializes an instance of `Move`.
    Args:
      desired_speed: A float. If this value is zero, reward is given simply
        for standing upright. Otherwise this specifies the horizontal velocity
        at which the velocity-dependent reward component is maximized.
      random: Optional, either a `numpy.random.RandomState` instance, an
        integer seed for creating a new `RandomState`, or None to select a seed
        automatically (default).
    """
    self._desired_speed = desired_speed
    super().__init__(random=random)

  def initialize_episode(self, physics):
    """Sets the state of the environment at the start of each episode.
    Args:
      physics: An instance of `Physics`.
    """
    # Initial configuration.
    orientation = self.random.randn(4)
    orientation /= np.linalg.norm(orientation)
    _find_non_contacting_height(physics, orientation)
    super().initialize_episode(physics)

  def get_observation(self, physics):
    """Returns an observation to the agent."""
    return _common_observations(physics)

  def get_reward(self, physics):
    """Returns a reward to the agent."""

    # Move reward term.
    move_reward = rewards.tolerance(
        physics.torso_velocity()[0],
        bounds=(self._desired_speed, float('inf')),
        margin=self._desired_speed,
        value_at_margin=0.5,
        sigmoid='linear')

    return _upright_reward(physics) * move_reward


class Stand(base.Task):
  """A quadruped task solved by moving forward at a designated speed."""

  def __init__(self, random=None, goal='stand'):
    """Initializes an instance of `Move`.
    Args:
      desired_speed: A float. If this value is zero, reward is given simply
        for standing upright. Otherwise this specifies the horizontal velocity
        at which the velocity-dependent reward component is maximized.
      random: Optional, either a `numpy.random.RandomState` instance, an
        integer seed for creating a new `RandomState`, or None to select a seed
        automatically (default).
    """
    super().__init__(random=random)
    self._goal = goal

  def initialize_episode(self, physics):
    """Sets the state of the environment at the start of each episode.
    Args:
      physics: An instance of `Physics`.
    """
    # Initial configuration.
    orientation = self.random.randn(4)
    orientation /= np.linalg.norm(orientation)
    _find_non_contacting_height(physics, orientation)
    super().initialize_episode(physics)

  def get_observation(self, physics):
    """Returns an observation to the agent."""
    return _common_observations(physics)

  def get_reward(self, physics):
    """Returns a reward to the agent."""
    if self._goal == 'stand':
      return _upright_reward(physics) 
    elif self._goal == 'lie_down':
          return _lie_down_reward(physics) 
    elif self._goal == 'two_legs':
      return _two_legs_reward(physics) 

class Jump(base.Task):
  """A quadruped task solved by moving forward at a designated speed."""

  def __init__(self, desired_height, random=None):
    """Initializes an instance of `Move`.
    Args:
      desired_speed: A float. If this value is zero, reward is given simply
        for standing upright. Otherwise this specifies the horizontal velocity
        at which the velocity-dependent reward component is maximized.
      random: Optional, either a `numpy.random.RandomState` instance, an
        integer seed for creating a new `RandomState`, or None to select a seed
        automatically (default).
    """
    self._desired_height = desired_height
    super().__init__(random=random)

  def initialize_episode(self, physics):
    """Sets the state of the environment at the start of each episode.
    Args:
      physics: An instance of `Physics`.
    """
    # Initial configuration.
    orientation = self.random.randn(4)
    orientation /= np.linalg.norm(orientation)
    _find_non_contacting_height(physics, orientation)
    super().initialize_episode(physics)

  def get_observation(self, physics):
    """Returns an observation to the agent."""
    return _common_observations(physics)

  def get_reward(self, physics):
    """Returns a reward to the agent."""

    # Move reward term.
    jump_up = rewards.tolerance(
        physics.com_height(),
        bounds=(self._desired_height, float('inf')),
        margin=self._desired_height,
        value_at_margin=0.5,
        sigmoid='linear')

    return _upright_reward(physics) * jump_up


class Roll(base.Task):
  """A quadruped task solved by moving forward at a designated speed."""

  def __init__(self, desired_speed, random=None):
    """Initializes an instance of `Move`.
    Args:
      desired_speed: A float. If this value is zero, reward is given simply
        for standing upright. Otherwise this specifies the horizontal velocity
        at which the velocity-dependent reward component is maximized.
      random: Optional, either a `numpy.random.RandomState` instance, an
        integer seed for creating a new `RandomState`, or None to select a seed
        automatically (default).
    """
    self._desired_speed = desired_speed
    super().__init__(random=random)

  def initialize_episode(self, physics):
    """Sets the state of the environment at the start of each episode.
    Args:
      physics: An instance of `Physics`.
    """
    # Initial configuration.
    orientation = self.random.randn(4)
    orientation /= np.linalg.norm(orientation)
    _find_non_contacting_height(physics, orientation)
    super().initialize_episode(physics)

  def get_observation(self, physics):
    """Returns an observation to the agent."""
    return _common_observations(physics)

  def get_reward(self, physics):
    """Returns a reward to the agent."""
    # Move reward term.
    move_reward = rewards.tolerance(
        np.linalg.norm(physics.torso_velocity()),
        bounds=(self._desired_speed, float('inf')),
        margin=self._desired_speed,
        value_at_margin=0.5,
        sigmoid='linear')

    return _upright_reward(physics) * move_reward


class Escape(base.Task):
  """A quadruped task solved by escaping a bowl-shaped terrain."""

  def initialize_episode(self, physics):
    """Sets the state of the environment at the start of each episode.
    Args:
      physics: An instance of `Physics`.
    """
    # Get heightfield resolution, assert that it is square.
    res = physics.model.hfield_nrow[_HEIGHTFIELD_ID]
    assert res == physics.model.hfield_ncol[_HEIGHTFIELD_ID]
    # Sinusoidal bowl shape.
    row_grid, col_grid = np.ogrid[-1:1:res*1j, -1:1:res*1j]
    radius = np.clip(np.sqrt(col_grid**2 + row_grid**2), .04, 1)
    bowl_shape = .5 - np.cos(2*np.pi*radius)/2
    # Random smooth bumps.
    terrain_size = 2 * physics.model.hfield_size[_HEIGHTFIELD_ID, 0]
    bump_res = int(terrain_size / _TERRAIN_BUMP_SCALE)
    bumps = self.random.uniform(_TERRAIN_SMOOTHNESS, 1, (bump_res, bump_res))
    smooth_bumps = ndimage.zoom(bumps, res / float(bump_res))
    # Terrain is elementwise product.
    terrain = bowl_shape * smooth_bumps
    start_idx = physics.model.hfield_adr[_HEIGHTFIELD_ID]
    physics.model.hfield_data[start_idx:start_idx+res**2] = terrain.ravel()
    super().initialize_episode(physics)

    # If we have a rendering context, we need to re-upload the modified
    # heightfield data.
    if physics.contexts:
      with physics.contexts.gl.make_current() as ctx:
        ctx.call(mjlib.mjr_uploadHField,
                 physics.model.ptr,
                 physics.contexts.mujoco.ptr,
                 _HEIGHTFIELD_ID)

    # Initial configuration.
    orientation = self.random.randn(4)
    orientation /= np.linalg.norm(orientation)
    _find_non_contacting_height(physics, orientation)

  def get_observation(self, physics):
    """Returns an observation to the agent."""
    obs = _common_observations(physics)
    obs['origin'] = physics.origin()
    obs['rangefinder'] = physics.rangefinder()
    return obs

  def get_reward(self, physics):
    """Returns a reward to the agent."""

    # Escape reward term.
    terrain_size = physics.model.hfield_size[_HEIGHTFIELD_ID, 0]
    escape_reward = rewards.tolerance(
        physics.origin_distance(),
        bounds=(terrain_size, float('inf')),
        margin=terrain_size,
        value_at_margin=0,
        sigmoid='linear')

    return _upright_reward(physics, deviation_angle=20) * escape_reward


class Fetch(base.Task):
  """A quadruped task solved by bringing a ball to the origin."""

  def initialize_episode(self, physics):
    """Sets the state of the environment at the start of each episode.
    Args:
      physics: An instance of `Physics`.
    """
    # Initial configuration, random azimuth and horizontal position.
    azimuth = self.random.uniform(0, 2*np.pi)
    orientation = np.array((np.cos(azimuth/2), 0, 0, np.sin(azimuth/2)))
    spawn_radius = 0.9 * physics.named.model.geom_size['floor', 0]
    x_pos, y_pos = self.random.uniform(-spawn_radius, spawn_radius, size=(2,))
    _find_non_contacting_height(physics, orientation, x_pos, y_pos)

    # Initial ball state.
    physics.named.data.qpos['ball_root'][:2] = self.random.uniform(
        -spawn_radius, spawn_radius, size=(2,))
    physics.named.data.qpos['ball_root'][2] = 2
    physics.named.data.qvel['ball_root'][:2] = 5*self.random.randn(2)
    super().initialize_episode(physics)

  def get_observation(self, physics):
    """Returns an observation to the agent."""
    obs = _common_observations(physics)
    obs['ball_state'] = physics.ball_state()
    obs['target_position'] = physics.target_position()
    return obs

  def get_reward(self, physics):
    """Returns a reward to the agent."""

    # Reward for moving close to the ball.
    arena_radius = physics.named.model.geom_size['floor', 0] * np.sqrt(2)
    workspace_radius = physics.named.model.site_size['workspace', 0]
    ball_radius = physics.named.model.geom_size['ball', 0]
    reach_reward = rewards.tolerance(
        physics.self_to_ball_distance(),
        bounds=(0, workspace_radius+ball_radius),
        sigmoid='linear',
        margin=arena_radius, value_at_margin=0)

    # Reward for bringing the ball to the target.
    target_radius = physics.named.model.site_size['target', 0]
    fetch_reward = rewards.tolerance(
        physics.ball_to_target_distance(),
        bounds=(0, target_radius),
        sigmoid='linear',
        margin=arena_radius, value_at_margin=0)

    reach_then_fetch = reach_reward * (0.5 + 0.5*fetch_reward)

    return _upright_reward(physics) * reach_then_fetch