Spaces:

zjowowen
/

gomoku

Sleeping

File size: 55,258 Bytes

079c32c

from typing import List, Dict, Tuple
from ditk import logging
from copy import deepcopy
from easydict import EasyDict
from torch.utils.data import Dataset
from dataclasses import dataclass

import pickle
import easydict
import torch
import numpy as np

from ding.utils.bfs_helper import get_vi_sequence
from ding.utils import DATASET_REGISTRY, import_module, DatasetNormalizer
from ding.rl_utils import discount_cumsum


@dataclass
class DatasetStatistics:
    """
    Overview:
        Dataset statistics.
    """
    mean: np.ndarray  # obs
    std: np.ndarray  # obs
    action_bounds: np.ndarray


@DATASET_REGISTRY.register('naive')
class NaiveRLDataset(Dataset):
    """
    Overview:
        Naive RL dataset, which is used for offline RL algorithms.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    """

    def __init__(self, cfg) -> None:
        """
        Overview:
            Initialization method.
        Arguments:
            - cfg (:obj:`dict`): Config dict.
        """

        assert type(cfg) in [str, EasyDict], "invalid cfg type: {}".format(type(cfg))
        if isinstance(cfg, EasyDict):
            self._data_path = cfg.policy.collect.data_path
        elif isinstance(cfg, str):
            self._data_path = cfg
        with open(self._data_path, 'rb') as f:
            self._data: List[Dict[str, torch.Tensor]] = pickle.load(f)

    def __len__(self) -> int:
        """
        Overview:
            Get the length of the dataset.
        """

        return len(self._data)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Overview:
            Get the item of the dataset.
        """

        return self._data[idx]


@DATASET_REGISTRY.register('d4rl')
class D4RLDataset(Dataset):
    """
    Overview:
        D4RL dataset, which is used for offline RL algorithms.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    Properties:
        - mean (:obj:`np.ndarray`): Mean of the dataset.
        - std (:obj:`np.ndarray`): Std of the dataset.
        - action_bounds (:obj:`np.ndarray`): Action bounds of the dataset.
        - statistics (:obj:`dict`): Statistics of the dataset.
    """

    def __init__(self, cfg: dict) -> None:
        """
        Overview:
            Initialization method.
        Arguments:
            - cfg (:obj:`dict`): Config dict.
        """

        import gym
        try:
            import d4rl  # register d4rl enviroments with open ai gym
        except ImportError:
            import sys
            logging.warning("not found d4rl env, please install it, refer to https://github.com/rail-berkeley/d4rl")
            sys.exit(1)

        # Init parameters
        data_path = cfg.policy.collect.get('data_path', None)
        env_id = cfg.env.env_id

        # Create the environment
        if data_path:
            d4rl.set_dataset_path(data_path)
        env = gym.make(env_id)
        dataset = d4rl.qlearning_dataset(env)
        self._cal_statistics(dataset, env)
        try:
            if cfg.env.norm_obs.use_norm and cfg.env.norm_obs.offline_stats.use_offline_stats:
                dataset = self._normalize_states(dataset)
        except (KeyError, AttributeError):
            # do not normalize
            pass
        self._data = []
        self._load_d4rl(dataset)

    @property
    def data(self) -> List:
        return self._data

    def __len__(self) -> int:
        """
        Overview:
            Get the length of the dataset.
        """

        return len(self._data)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Overview:
            Get the item of the dataset.
        """

        return self._data[idx]

    def _load_d4rl(self, dataset: Dict[str, np.ndarray]) -> None:
        """
        Overview:
            Load the d4rl dataset.
        Arguments:
            - dataset (:obj:`Dict[str, np.ndarray]`): The d4rl dataset.
        """

        for i in range(len(dataset['observations'])):
            trans_data = {}
            trans_data['obs'] = torch.from_numpy(dataset['observations'][i])
            trans_data['next_obs'] = torch.from_numpy(dataset['next_observations'][i])
            trans_data['action'] = torch.from_numpy(dataset['actions'][i])
            trans_data['reward'] = torch.tensor(dataset['rewards'][i])
            trans_data['done'] = dataset['terminals'][i]
            self._data.append(trans_data)

    def _cal_statistics(self, dataset, env, eps=1e-3, add_action_buffer=True):
        """
        Overview:
            Calculate the statistics of the dataset.
        Arguments:
            - dataset (:obj:`Dict[str, np.ndarray]`): The d4rl dataset.
            - env (:obj:`gym.Env`): The environment.
            - eps (:obj:`float`): Epsilon.
        """

        self._mean = dataset['observations'].mean(0)
        self._std = dataset['observations'].std(0) + eps
        action_max = dataset['actions'].max(0)
        action_min = dataset['actions'].min(0)
        if add_action_buffer:
            action_buffer = 0.05 * (action_max - action_min)
            action_max = (action_max + action_buffer).clip(max=env.action_space.high)
            action_min = (action_min - action_buffer).clip(min=env.action_space.low)
        self._action_bounds = np.stack([action_min, action_max], axis=0)

    def _normalize_states(self, dataset):
        """
        Overview:
            Normalize the states.
        Arguments:
            - dataset (:obj:`Dict[str, np.ndarray]`): The d4rl dataset.
        """

        dataset['observations'] = (dataset['observations'] - self._mean) / self._std
        dataset['next_observations'] = (dataset['next_observations'] - self._mean) / self._std
        return dataset

    @property
    def mean(self):
        """
        Overview:
            Get the mean of the dataset.
        """

        return self._mean

    @property
    def std(self):
        """
        Overview:
            Get the std of the dataset.
        """

        return self._std

    @property
    def action_bounds(self) -> np.ndarray:
        """
        Overview:
            Get the action bounds of the dataset.
        """

        return self._action_bounds

    @property
    def statistics(self) -> dict:
        """
        Overview:
            Get the statistics of the dataset.
        """

        return DatasetStatistics(mean=self.mean, std=self.std, action_bounds=self.action_bounds)


@DATASET_REGISTRY.register('hdf5')
class HDF5Dataset(Dataset):
    """
    Overview:
        HDF5 dataset is saved in hdf5 format, which is used for offline RL algorithms.
        The hdf5 format is a common format for storing large numerical arrays in Python.
        For more details, please refer to https://support.hdfgroup.org/HDF5/.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    Properties:
        - mean (:obj:`np.ndarray`): Mean of the dataset.
        - std (:obj:`np.ndarray`): Std of the dataset.
        - action_bounds (:obj:`np.ndarray`): Action bounds of the dataset.
        - statistics (:obj:`dict`): Statistics of the dataset.
    """

    def __init__(self, cfg: dict) -> None:
        """
        Overview:
            Initialization method.
        Arguments:
            - cfg (:obj:`dict`): Config dict.
        """

        try:
            import h5py
        except ImportError:
            import sys
            logging.warning("not found h5py package, please install it trough `pip install h5py ")
            sys.exit(1)
        data_path = cfg.policy.collect.get('data_path', None)
        if 'dataset' in cfg:
            self.context_len = cfg.dataset.context_len
        else:
            self.context_len = 0
        data = h5py.File(data_path, 'r')
        self._load_data(data)
        self._cal_statistics()
        try:
            if cfg.env.norm_obs.use_norm and cfg.env.norm_obs.offline_stats.use_offline_stats:
                self._normalize_states()
        except (KeyError, AttributeError):
            # do not normalize
            pass

    def __len__(self) -> int:
        """
        Overview:
            Get the length of the dataset.
        """

        return len(self._data['obs']) - self.context_len

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Overview:
            Get the item of the dataset.
        Arguments:
            - idx (:obj:`int`): The index of the dataset.
        """

        if self.context_len == 0:  # for other offline RL algorithms
            return {k: self._data[k][idx] for k in self._data.keys()}
        else:  # for decision transformer
            block_size = self.context_len
            done_idx = idx + block_size
            idx = done_idx - block_size
            states = torch.as_tensor(
                np.array(self._data['obs'][idx:done_idx]), dtype=torch.float32
            ).view(block_size, -1)
            actions = torch.as_tensor(self._data['action'][idx:done_idx], dtype=torch.long)
            rtgs = torch.as_tensor(self._data['reward'][idx:done_idx, 0], dtype=torch.float32)
            timesteps = torch.as_tensor(range(idx, done_idx), dtype=torch.int64)
            traj_mask = torch.ones(self.context_len, dtype=torch.long)
            return timesteps, states, actions, rtgs, traj_mask

    def _load_data(self, dataset: Dict[str, np.ndarray]) -> None:
        """
        Overview:
            Load the dataset.
        Arguments:
            - dataset (:obj:`Dict[str, np.ndarray]`): The dataset.
        """

        self._data = {}
        for k in dataset.keys():
            logging.info(f'Load {k} data.')
            self._data[k] = dataset[k][:]

    def _cal_statistics(self, eps: float = 1e-3):
        """
        Overview:
            Calculate the statistics of the dataset.
        Arguments:
            - eps (:obj:`float`): Epsilon.
        """

        self._mean = self._data['obs'].mean(0)
        self._std = self._data['obs'].std(0) + eps
        action_max = self._data['action'].max(0)
        action_min = self._data['action'].min(0)
        buffer = 0.05 * (action_max - action_min)
        action_max = action_max.astype(float) + buffer
        action_min = action_max.astype(float) - buffer
        self._action_bounds = np.stack([action_min, action_max], axis=0)

    def _normalize_states(self):
        """
        Overview:
            Normalize the states.
        """

        self._data['obs'] = (self._data['obs'] - self._mean) / self._std
        self._data['next_obs'] = (self._data['next_obs'] - self._mean) / self._std

    @property
    def mean(self):
        """
        Overview:
            Get the mean of the dataset.
        """

        return self._mean

    @property
    def std(self):
        """
        Overview:
            Get the std of the dataset.
        """

        return self._std

    @property
    def action_bounds(self) -> np.ndarray:
        """
        Overview:
            Get the action bounds of the dataset.
        """

        return self._action_bounds

    @property
    def statistics(self) -> dict:
        """
        Overview:
            Get the statistics of the dataset.
        """

        return DatasetStatistics(mean=self.mean, std=self.std, action_bounds=self.action_bounds)


@DATASET_REGISTRY.register('d4rl_trajectory')
class D4RLTrajectoryDataset(Dataset):
    """
    Overview:
        D4RL trajectory dataset, which is used for offline RL algorithms.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    """

    # from infos.py from official d4rl github repo
    REF_MIN_SCORE = {
        'halfcheetah': -280.178953,
        'walker2d': 1.629008,
        'hopper': -20.272305,
    }

    REF_MAX_SCORE = {
        'halfcheetah': 12135.0,
        'walker2d': 4592.3,
        'hopper': 3234.3,
    }

    # calculated from d4rl datasets
    D4RL_DATASET_STATS = {
        'halfcheetah-medium-v2': {
            'state_mean': [
                -0.06845773756504059, 0.016414547339081764, -0.18354906141757965, -0.2762460708618164,
                -0.34061527252197266, -0.09339715540409088, -0.21321271359920502, -0.0877423882484436,
                5.173007488250732, -0.04275195300579071, -0.036108363419771194, 0.14053793251514435,
                0.060498327016830444, 0.09550975263118744, 0.06739100068807602, 0.005627387668937445,
                0.013382787816226482
            ],
            'state_std': [
                0.07472999393939972, 0.3023499846458435, 0.30207309126853943, 0.34417077898979187, 0.17619241774082184,
                0.507205605506897, 0.2567007839679718, 0.3294812738895416, 1.2574149370193481, 0.7600541710853577,
                1.9800915718078613, 6.565362453460693, 7.466367721557617, 4.472222805023193, 10.566964149475098,
                5.671932697296143, 7.4982590675354
            ]
        },
        'halfcheetah-medium-replay-v2': {
            'state_mean': [
                -0.12880703806877136, 0.3738119602203369, -0.14995987713336945, -0.23479078710079193,
                -0.2841278612613678, -0.13096535205841064, -0.20157982409000397, -0.06517726927995682,
                3.4768247604370117, -0.02785065770149231, -0.015035249292850494, 0.07697279006242752,
                0.01266712136566639, 0.027325302362442017, 0.02316424623131752, 0.010438721626996994,
                -0.015839405357837677
            ],
            'state_std': [
                0.17019015550613403, 1.284424901008606, 0.33442774415016174, 0.3672759234905243, 0.26092398166656494,
                0.4784106910228729, 0.3181420564651489, 0.33552637696266174, 2.0931615829467773, 0.8037433624267578,
                1.9044333696365356, 6.573209762573242, 7.572863578796387, 5.069749355316162, 9.10555362701416,
                6.085654258728027, 7.25300407409668
            ]
        },
        'halfcheetah-medium-expert-v2': {
            'state_mean': [
                -0.05667462572455406, 0.024369969964027405, -0.061670560389757156, -0.22351515293121338,
                -0.2675151228904724, -0.07545716315507889, -0.05809682980179787, -0.027675075456500053,
                8.110626220703125, -0.06136331334710121, -0.17986927926540375, 0.25175222754478455, 0.24186332523822784,
                0.2519369423389435, 0.5879552960395813, -0.24090635776519775, -0.030184272676706314
            ],
            'state_std': [
                0.06103534251451492, 0.36054104566574097, 0.45544400811195374, 0.38476887345314026, 0.2218363732099533,
                0.5667523741722107, 0.3196682929992676, 0.2852923572063446, 3.443821907043457, 0.6728139519691467,
                1.8616976737976074, 9.575807571411133, 10.029894828796387, 5.903450012207031, 12.128185272216797,
                6.4811787605285645, 6.378620147705078
            ]
        },
        'walker2d-medium-v2': {
            'state_mean': [
                1.218966007232666, 0.14163373410701752, -0.03704913705587387, -0.13814310729503632, 0.5138224363327026,
                -0.04719110205769539, -0.47288352251052856, 0.042254164814949036, 2.3948874473571777,
                -0.03143199160695076, 0.04466355964541435, -0.023907244205474854, -0.1013401448726654,
                0.09090937674045563, -0.004192637279629707, -0.12120571732521057, -0.5497063994407654
            ],
            'state_std': [
                0.12311358004808426, 0.3241879940032959, 0.11456084251403809, 0.2623065710067749, 0.5640279054641724,
                0.2271878570318222, 0.3837319612503052, 0.7373676896095276, 1.2387926578521729, 0.798020601272583,
                1.5664079189300537, 1.8092705011367798, 3.025604248046875, 4.062486171722412, 1.4586567878723145,
                3.7445690631866455, 5.5851287841796875
            ]
        },
        'walker2d-medium-replay-v2': {
            'state_mean': [
                1.209364652633667, 0.13264022767543793, -0.14371201395988464, -0.2046516090631485, 0.5577612519264221,
                -0.03231537342071533, -0.2784661054611206, 0.19130706787109375, 1.4701707363128662,
                -0.12504704296588898, 0.0564953051507473, -0.09991033375263214, -0.340340256690979, 0.03546293452382088,
                -0.08934258669614792, -0.2992438077926636, -0.5984178185462952
            ],
            'state_std': [
                0.11929835379123688, 0.3562574088573456, 0.25852200388908386, 0.42075422406196594, 0.5202291011810303,
                0.15685082972049713, 0.36770978569984436, 0.7161387801170349, 1.3763766288757324, 0.8632221817970276,
                2.6364643573760986, 3.0134117603302, 3.720684051513672, 4.867283821105957, 2.6681625843048096,
                3.845186948776245, 5.4768385887146
            ]
        },
        'walker2d-medium-expert-v2': {
            'state_mean': [
                1.2294334173202515, 0.16869689524173737, -0.07089081406593323, -0.16197483241558075,
                0.37101927399635315, -0.012209027074277401, -0.42461398243904114, 0.18986578285694122,
                3.162475109100342, -0.018092676997184753, 0.03496946766972542, -0.013921679928898811,
                -0.05937029421329498, -0.19549426436424255, -0.0019200450042262673, -0.062483321875333786,
                -0.27366524934768677
            ],
            'state_std': [
                0.09932824969291687, 0.25981399416923523, 0.15062759816646576, 0.24249176681041718, 0.6758718490600586,
                0.1650741547346115, 0.38140663504600525, 0.6962361335754395, 1.3501490354537964, 0.7641991376876831,
                1.534574270248413, 2.1785972118377686, 3.276582717895508, 4.766193866729736, 1.1716983318328857,
                4.039782524108887, 5.891613960266113
            ]
        },
        'hopper-medium-v2': {
            'state_mean': [
                1.311279058456421, -0.08469521254301071, -0.5382719039916992, -0.07201576232910156, 0.04932365566492081,
                2.1066856384277344, -0.15017354488372803, 0.008783451281487942, -0.2848185896873474,
                -0.18540096282958984, -0.28461286425590515
            ],
            'state_std': [
                0.17790751159191132, 0.05444620922207832, 0.21297138929367065, 0.14530418813228607, 0.6124444007873535,
                0.8517446517944336, 1.4515252113342285, 0.6751695871353149, 1.5362390279769897, 1.616074562072754,
                5.607253551483154
            ]
        },
        'hopper-medium-replay-v2': {
            'state_mean': [
                1.2305138111114502, -0.04371410980820656, -0.44542956352233887, -0.09370097517967224,
                0.09094487875699997, 1.3694725036621094, -0.19992674887180328, -0.022861352190375328,
                -0.5287045240402222, -0.14465883374214172, -0.19652697443962097
            ],
            'state_std': [
                0.1756512075662613, 0.0636928603053093, 0.3438323438167572, 0.19566889107227325, 0.5547984838485718,
                1.051029920578003, 1.158307671546936, 0.7963128685951233, 1.4802359342575073, 1.6540331840515137,
                5.108601093292236
            ]
        },
        'hopper-medium-expert-v2': {
            'state_mean': [
                1.3293815851211548, -0.09836531430482864, -0.5444297790527344, -0.10201650857925415,
                0.02277466468513012, 2.3577215671539307, -0.06349576264619827, -0.00374026270583272,
                -0.1766270101070404, -0.11862941086292267, -0.12097819894552231
            ],
            'state_std': [
                0.17012375593185425, 0.05159067362546921, 0.18141433596611023, 0.16430604457855225, 0.6023368239402771,
                0.7737284898757935, 1.4986555576324463, 0.7483318448066711, 1.7953159809112549, 2.0530025959014893,
                5.725032806396484
            ]
        },
    }

    def __init__(self, cfg: dict) -> None:
        """
        Overview:
            Initialization method.
        Arguments:
            - cfg (:obj:`dict`): Config dict.
        """

        dataset_path = cfg.dataset.data_dir_prefix
        rtg_scale = cfg.dataset.rtg_scale
        self.context_len = cfg.dataset.context_len
        self.env_type = cfg.dataset.env_type

        if 'hdf5' in dataset_path:  # for mujoco env
            try:
                import h5py
                import collections
            except ImportError:
                import sys
                logging.warning("not found h5py package, please install it trough `pip install h5py ")
                sys.exit(1)
            dataset = h5py.File(dataset_path, 'r')

            N = dataset['rewards'].shape[0]
            data_ = collections.defaultdict(list)

            use_timeouts = False
            if 'timeouts' in dataset:
                use_timeouts = True

            episode_step = 0
            paths = []
            for i in range(N):
                done_bool = bool(dataset['terminals'][i])
                if use_timeouts:
                    final_timestep = dataset['timeouts'][i]
                else:
                    final_timestep = (episode_step == 1000 - 1)
                for k in ['observations', 'actions', 'rewards', 'terminals']:
                    data_[k].append(dataset[k][i])
                if done_bool or final_timestep:
                    episode_step = 0
                    episode_data = {}
                    for k in data_:
                        episode_data[k] = np.array(data_[k])
                    paths.append(episode_data)
                    data_ = collections.defaultdict(list)
                episode_step += 1

            self.trajectories = paths

            # calculate state mean and variance and returns_to_go for all traj
            states = []
            for traj in self.trajectories:
                traj_len = traj['observations'].shape[0]
                states.append(traj['observations'])
                # calculate returns to go and rescale them
                traj['returns_to_go'] = discount_cumsum(traj['rewards'], 1.0) / rtg_scale

            # used for input normalization
            states = np.concatenate(states, axis=0)
            self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

            # normalize states
            for traj in self.trajectories:
                traj['observations'] = (traj['observations'] - self.state_mean) / self.state_std

        elif 'pkl' in dataset_path:
            if 'dqn' in dataset_path:
                # load dataset
                with open(dataset_path, 'rb') as f:
                    self.trajectories = pickle.load(f)

                if isinstance(self.trajectories[0], list):
                    # for our collected dataset, e.g. cartpole/lunarlander case
                    trajectories_tmp = []

                    original_keys = ['obs', 'next_obs', 'action', 'reward']
                    keys = ['observations', 'next_observations', 'actions', 'rewards']
                    trajectories_tmp = [
                        {
                            key: np.stack(
                                [
                                    self.trajectories[eps_index][transition_index][o_key]
                                    for transition_index in range(len(self.trajectories[eps_index]))
                                ],
                                axis=0
                            )
                            for key, o_key in zip(keys, original_keys)
                        } for eps_index in range(len(self.trajectories))
                    ]
                    self.trajectories = trajectories_tmp

                states = []
                for traj in self.trajectories:
                    # traj_len = traj['observations'].shape[0]
                    states.append(traj['observations'])
                    # calculate returns to go and rescale them
                    traj['returns_to_go'] = discount_cumsum(traj['rewards'], 1.0) / rtg_scale

                # used for input normalization
                states = np.concatenate(states, axis=0)
                self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

                # normalize states
                for traj in self.trajectories:
                    traj['observations'] = (traj['observations'] - self.state_mean) / self.state_std
            else:
                # load dataset
                with open(dataset_path, 'rb') as f:
                    self.trajectories = pickle.load(f)

                states = []
                for traj in self.trajectories:
                    states.append(traj['observations'])
                    # calculate returns to go and rescale them
                    traj['returns_to_go'] = discount_cumsum(traj['rewards'], 1.0) / rtg_scale

                # used for input normalization
                states = np.concatenate(states, axis=0)
                self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

                # normalize states
                for traj in self.trajectories:
                    traj['observations'] = (traj['observations'] - self.state_mean) / self.state_std
        else:
            # -- load data from memory (make more efficient)
            obss = []
            actions = []
            returns = [0]
            done_idxs = []
            stepwise_returns = []

            transitions_per_buffer = np.zeros(50, dtype=int)
            num_trajectories = 0
            while len(obss) < cfg.dataset.num_steps:
                buffer_num = np.random.choice(np.arange(50 - cfg.dataset.num_buffers, 50), 1)[0]
                i = transitions_per_buffer[buffer_num]
                frb = FixedReplayBuffer(
                    data_dir=cfg.dataset.data_dir_prefix + '/1/replay_logs',
                    replay_suffix=buffer_num,
                    observation_shape=(84, 84),
                    stack_size=4,
                    update_horizon=1,
                    gamma=0.99,
                    observation_dtype=np.uint8,
                    batch_size=32,
                    replay_capacity=100000
                )
                if frb._loaded_buffers:
                    done = False
                    curr_num_transitions = len(obss)
                    trajectories_to_load = cfg.dataset.trajectories_per_buffer
                    while not done:
                        states, ac, ret, next_states, next_action, next_reward, terminal, indices = \
                        frb.sample_transition_batch(batch_size=1, indices=[i])
                        states = states.transpose((0, 3, 1, 2))[0]  # (1, 84, 84, 4) --> (4, 84, 84)
                        obss.append(states)
                        actions.append(ac[0])
                        stepwise_returns.append(ret[0])
                        if terminal[0]:
                            done_idxs.append(len(obss))
                            returns.append(0)
                            if trajectories_to_load == 0:
                                done = True
                            else:
                                trajectories_to_load -= 1
                        returns[-1] += ret[0]
                        i += 1
                        if i >= 100000:
                            obss = obss[:curr_num_transitions]
                            actions = actions[:curr_num_transitions]
                            stepwise_returns = stepwise_returns[:curr_num_transitions]
                            returns[-1] = 0
                            i = transitions_per_buffer[buffer_num]
                            done = True
                    num_trajectories += (cfg.dataset.trajectories_per_buffer - trajectories_to_load)
                    transitions_per_buffer[buffer_num] = i

            actions = np.array(actions)
            returns = np.array(returns)
            stepwise_returns = np.array(stepwise_returns)
            done_idxs = np.array(done_idxs)

            # -- create reward-to-go dataset
            start_index = 0
            rtg = np.zeros_like(stepwise_returns)
            for i in done_idxs:
                i = int(i)
                curr_traj_returns = stepwise_returns[start_index:i]
                for j in range(i - 1, start_index - 1, -1):  # start from i-1
                    rtg_j = curr_traj_returns[j - start_index:i - start_index]
                    rtg[j] = sum(rtg_j)
                start_index = i

            # -- create timestep dataset
            start_index = 0
            timesteps = np.zeros(len(actions) + 1, dtype=int)
            for i in done_idxs:
                i = int(i)
                timesteps[start_index:i + 1] = np.arange(i + 1 - start_index)
                start_index = i + 1

            self.obss = obss
            self.actions = actions
            self.done_idxs = done_idxs
            self.rtgs = rtg
            self.timesteps = timesteps
            # return obss, actions, returns, done_idxs, rtg, timesteps

    def get_max_timestep(self) -> int:
        """
        Overview:
            Get the max timestep of the dataset.
        """

        return max(self.timesteps)

    def get_state_stats(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Overview:
            Get the state mean and std of the dataset.
        """

        return deepcopy(self.state_mean), deepcopy(self.state_std)

    def get_d4rl_dataset_stats(self, env_d4rl_name: str) -> Dict[str, list]:
        """
        Overview:
            Get the d4rl dataset stats.
        Arguments:
            - env_d4rl_name (:obj:`str`): The d4rl env name.
        """

        return self.D4RL_DATASET_STATS[env_d4rl_name]

    def __len__(self) -> int:
        """
        Overview:
            Get the length of the dataset.
        """

        if self.env_type != 'atari':
            return len(self.trajectories)
        else:
            return len(self.obss) - self.context_len

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Overview:
            Get the item of the dataset.
        Arguments:
            - idx (:obj:`int`): The index of the dataset.
        """

        if self.env_type != 'atari':
            traj = self.trajectories[idx]
            traj_len = traj['observations'].shape[0]

            if traj_len > self.context_len:
                # sample random index to slice trajectory
                si = np.random.randint(0, traj_len - self.context_len)

                states = torch.from_numpy(traj['observations'][si:si + self.context_len])
                actions = torch.from_numpy(traj['actions'][si:si + self.context_len])
                returns_to_go = torch.from_numpy(traj['returns_to_go'][si:si + self.context_len])
                timesteps = torch.arange(start=si, end=si + self.context_len, step=1)

                # all ones since no padding
                traj_mask = torch.ones(self.context_len, dtype=torch.long)

            else:
                padding_len = self.context_len - traj_len

                # padding with zeros
                states = torch.from_numpy(traj['observations'])
                states = torch.cat(
                    [states, torch.zeros(([padding_len] + list(states.shape[1:])), dtype=states.dtype)], dim=0
                )

                actions = torch.from_numpy(traj['actions'])
                actions = torch.cat(
                    [actions, torch.zeros(([padding_len] + list(actions.shape[1:])), dtype=actions.dtype)], dim=0
                )

                returns_to_go = torch.from_numpy(traj['returns_to_go'])
                returns_to_go = torch.cat(
                    [
                        returns_to_go,
                        torch.zeros(([padding_len] + list(returns_to_go.shape[1:])), dtype=returns_to_go.dtype)
                    ],
                    dim=0
                )

                timesteps = torch.arange(start=0, end=self.context_len, step=1)

                traj_mask = torch.cat(
                    [torch.ones(traj_len, dtype=torch.long),
                     torch.zeros(padding_len, dtype=torch.long)], dim=0
                )
            return timesteps, states, actions, returns_to_go, traj_mask
        else:  # mean cost less than 0.001s
            block_size = self.context_len
            done_idx = idx + block_size
            for i in self.done_idxs:
                if i > idx:  # first done_idx greater than idx
                    done_idx = min(int(i), done_idx)
                    break
            idx = done_idx - block_size
            states = torch.as_tensor(
                np.array(self.obss[idx:done_idx]), dtype=torch.float32
            ).view(block_size, -1)  # (block_size, 4*84*84)
            states = states / 255.
            actions = torch.as_tensor(self.actions[idx:done_idx], dtype=torch.long).unsqueeze(1)  # (block_size, 1)
            rtgs = torch.as_tensor(self.rtgs[idx:done_idx], dtype=torch.float32).unsqueeze(1)
            timesteps = torch.as_tensor(self.timesteps[idx:idx + 1], dtype=torch.int64).unsqueeze(1)
            traj_mask = torch.ones(self.context_len, dtype=torch.long)
            return timesteps, states, actions, rtgs, traj_mask


@DATASET_REGISTRY.register('d4rl_diffuser')
class D4RLDiffuserDataset(Dataset):
    """
    Overview:
        D4RL diffuser dataset, which is used for offline RL algorithms.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    """

    def __init__(self, dataset_path: str, context_len: int, rtg_scale: float) -> None:
        """
        Overview:
            Initialization method of D4RLDiffuserDataset.
        Arguments:
            - dataset_path (:obj:`str`): The dataset path.
            - context_len (:obj:`int`): The length of the context.
            - rtg_scale (:obj:`float`): The scale of the returns to go.
        """

        self.context_len = context_len

        # load dataset
        with open(dataset_path, 'rb') as f:
            self.trajectories = pickle.load(f)

        if isinstance(self.trajectories[0], list):
            # for our collected dataset, e.g. cartpole/lunarlander case
            trajectories_tmp = []

            original_keys = ['obs', 'next_obs', 'action', 'reward']
            keys = ['observations', 'next_observations', 'actions', 'rewards']
            for key, o_key in zip(keys, original_keys):
                trajectories_tmp = [
                    {
                        key: np.stack(
                            [
                                self.trajectories[eps_index][transition_index][o_key]
                                for transition_index in range(len(self.trajectories[eps_index]))
                            ],
                            axis=0
                        )
                    } for eps_index in range(len(self.trajectories))
                ]
            self.trajectories = trajectories_tmp

        states = []
        for traj in self.trajectories:
            traj_len = traj['observations'].shape[0]
            states.append(traj['observations'])
            # calculate returns to go and rescale them
            traj['returns_to_go'] = discount_cumsum(traj['rewards'], 1.0) / rtg_scale

        # used for input normalization
        states = np.concatenate(states, axis=0)
        self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

        # normalize states
        for traj in self.trajectories:
            traj['observations'] = (traj['observations'] - self.state_mean) / self.state_std


class FixedReplayBuffer(object):
    """
    Overview:
        Object composed of a list of OutofGraphReplayBuffers.
    Interfaces:
        ``__init__``, ``get_transition_elements``, ``sample_transition_batch``
    """

    def __init__(self, data_dir, replay_suffix, *args, **kwargs):  # pylint: disable=keyword-arg-before-vararg
        """
        Overview:
            Initialize the FixedReplayBuffer class.
        Arguments:
            - data_dir (:obj:`str`): log Directory from which to load the replay buffer.
            - replay_suffix (:obj:`int`): If not None, then only load the replay buffer \
                corresponding to the specific suffix in data directory.
            - args (:obj:`list`): Arbitrary extra arguments.
            - kwargs (:obj:`dict`): Arbitrary keyword arguments.

        """

        self._args = args
        self._kwargs = kwargs
        self._data_dir = data_dir
        self._loaded_buffers = False
        self.add_count = np.array(0)
        self._replay_suffix = replay_suffix
        if not self._loaded_buffers:
            if replay_suffix is not None:
                assert replay_suffix >= 0, 'Please pass a non-negative replay suffix'
                self.load_single_buffer(replay_suffix)
            else:
                pass
            # self._load_replay_buffers(num_buffers=50)

    def load_single_buffer(self, suffix):
        """
        Overview:
            Load a single replay buffer.
        Arguments:
            - suffix (:obj:`int`): The suffix of the replay buffer.
        """

        replay_buffer = self._load_buffer(suffix)
        if replay_buffer is not None:
            self._replay_buffers = [replay_buffer]
            self.add_count = replay_buffer.add_count
            self._num_replay_buffers = 1
            self._loaded_buffers = True

    def _load_buffer(self, suffix):
        """
        Overview:
            Loads a OutOfGraphReplayBuffer replay buffer.
        Arguments:
            - suffix (:obj:`int`): The suffix of the replay buffer.
        """

        try:
            from dopamine.replay_memory import circular_replay_buffer
            STORE_FILENAME_PREFIX = circular_replay_buffer.STORE_FILENAME_PREFIX
            # pytype: disable=attribute-error
            replay_buffer = circular_replay_buffer.OutOfGraphReplayBuffer(*self._args, **self._kwargs)
            replay_buffer.load(self._data_dir, suffix)
            # pytype: enable=attribute-error
            return replay_buffer
        # except tf.errors.NotFoundError:
        except:
            raise ('can not load')

    def get_transition_elements(self):
        """
        Overview:
            Returns the transition elements.
        """

        return self._replay_buffers[0].get_transition_elements()

    def sample_transition_batch(self, batch_size=None, indices=None):
        """
        Overview:
            Returns a batch of transitions (including any extra contents).
        Arguments:
            - batch_size (:obj:`int`): The batch size.
            - indices (:obj:`list`): The indices of the batch.
        """

        buffer_index = np.random.randint(self._num_replay_buffers)
        return self._replay_buffers[buffer_index].sample_transition_batch(batch_size=batch_size, indices=indices)


class PCDataset(Dataset):
    """
    Overview:
        Dataset for Procedure Cloning.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    """

    def __init__(self, all_data):
        """
        Overview:
            Initialization method of PCDataset.
        Arguments:
            - all_data (:obj:`tuple`): The tuple of all data.
        """

        self._data = all_data

    def __getitem__(self, item):
        """
        Overview:
            Get the item of the dataset.
        Arguments:
            - item (:obj:`int`): The index of the dataset.
        """

        return {'obs': self._data[0][item], 'bfs_in': self._data[1][item], 'bfs_out': self._data[2][item]}

    def __len__(self):
        """
        Overview:
            Get the length of the dataset.
        """

        return self._data[0].shape[0]


def load_bfs_datasets(train_seeds=1, test_seeds=5):
    """
    Overview:
        Load BFS datasets.
    Arguments:
        - train_seeds (:obj:`int`): The number of train seeds.
        - test_seeds (:obj:`int`): The number of test seeds.
    """

    from dizoo.maze.envs import Maze

    def load_env(seed):
        ccc = easydict.EasyDict({'size': 16})
        e = Maze(ccc)
        e.seed(seed)
        e.reset()
        return e

    envs = [load_env(i) for i in range(train_seeds + test_seeds)]

    observations_train = []
    observations_test = []
    bfs_input_maps_train = []
    bfs_input_maps_test = []
    bfs_output_maps_train = []
    bfs_output_maps_test = []
    for idx, env in enumerate(envs):
        if idx < train_seeds:
            observations = observations_train
            bfs_input_maps = bfs_input_maps_train
            bfs_output_maps = bfs_output_maps_train
        else:
            observations = observations_test
            bfs_input_maps = bfs_input_maps_test
            bfs_output_maps = bfs_output_maps_test

        start_obs = env.process_states(env._get_obs(), env.get_maze_map())
        _, track_back = get_vi_sequence(env, start_obs)
        env_observations = torch.stack([track_back[i][0] for i in range(len(track_back))], dim=0)

        for i in range(env_observations.shape[0]):
            bfs_sequence, _ = get_vi_sequence(env, env_observations[i].numpy().astype(np.int32))  # [L, W, W]
            bfs_input_map = env.n_action * np.ones([env.size, env.size], dtype=np.long)

            for j in range(bfs_sequence.shape[0]):
                bfs_input_maps.append(torch.from_numpy(bfs_input_map))
                bfs_output_maps.append(torch.from_numpy(bfs_sequence[j]))
                observations.append(env_observations[i])
                bfs_input_map = bfs_sequence[j]

    train_data = PCDataset(
        (
            torch.stack(observations_train, dim=0),
            torch.stack(bfs_input_maps_train, dim=0),
            torch.stack(bfs_output_maps_train, dim=0),
        )
    )
    test_data = PCDataset(
        (
            torch.stack(observations_test, dim=0),
            torch.stack(bfs_input_maps_test, dim=0),
            torch.stack(bfs_output_maps_test, dim=0),
        )
    )

    return train_data, test_data


@DATASET_REGISTRY.register('bco')
class BCODataset(Dataset):
    """
    Overview:
        Dataset for Behavioral Cloning from Observation.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    Properties:
        - obs (:obj:`np.ndarray`): The observation array.
        - action (:obj:`np.ndarray`): The action array.
    """

    def __init__(self, data=None):
        """
        Overview:
            Initialization method of BCODataset.
        Arguments:
            - data (:obj:`dict`): The data dict.
        """

        if data is None:
            raise ValueError('Dataset can not be empty!')
        else:
            self._data = data

    def __len__(self):
        """
        Overview:
            Get the length of the dataset.
        """

        return len(self._data['obs'])

    def __getitem__(self, idx):
        """
        Overview:
            Get the item of the dataset.
        Arguments:
            - idx (:obj:`int`): The index of the dataset.
        """

        return {k: self._data[k][idx] for k in self._data.keys()}

    @property
    def obs(self):
        """
        Overview:
            Get the observation array.
        """

        return self._data['obs']

    @property
    def action(self):
        """
        Overview:
            Get the action array.
        """

        return self._data['action']


@DATASET_REGISTRY.register('diffuser_traj')
class SequenceDataset(torch.utils.data.Dataset):
    """
    Overview:
        Dataset for diffuser.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    """

    def __init__(self, cfg):
        """
        Overview:
            Initialization method of SequenceDataset.
        Arguments:
            - cfg (:obj:`dict`): The config dict.
        """

        import gym

        env_id = cfg.env.env_id
        data_path = cfg.policy.collect.get('data_path', None)
        env = gym.make(env_id)

        dataset = env.get_dataset()

        self.returns_scale = cfg.env.returns_scale
        self.horizon = cfg.env.horizon
        self.max_path_length = cfg.env.max_path_length
        self.discount = cfg.policy.learn.discount_factor
        self.discounts = self.discount ** np.arange(self.max_path_length)[:, None]
        self.use_padding = cfg.env.use_padding
        self.include_returns = cfg.env.include_returns
        self.env_id = cfg.env.env_id
        itr = self.sequence_dataset(env, dataset)
        self.n_episodes = 0

        fields = {}
        for k in dataset.keys():
            if 'metadata' in k:
                continue
            fields[k] = []
        fields['path_lengths'] = []

        for i, episode in enumerate(itr):
            path_length = len(episode['observations'])
            assert path_length <= self.max_path_length
            fields['path_lengths'].append(path_length)
            for key, val in episode.items():
                if key not in fields:
                    fields[key] = []
                if val.ndim < 2:
                    val = np.expand_dims(val, axis=-1)
                shape = (self.max_path_length, val.shape[-1])
                arr = np.zeros(shape, dtype=np.float32)
                arr[:path_length] = val
                fields[key].append(arr)
            if episode['terminals'].any() and cfg.env.termination_penalty and 'timeouts' in episode:
                assert not episode['timeouts'].any(), 'Penalized a timeout episode for early termination'
                fields['rewards'][-1][path_length - 1] += cfg.env.termination_penalty
            self.n_episodes += 1

        for k in fields.keys():
            fields[k] = np.array(fields[k])

        self.normalizer = DatasetNormalizer(fields, cfg.policy.normalizer, path_lengths=fields['path_lengths'])
        self.indices = self.make_indices(fields['path_lengths'], self.horizon)

        self.observation_dim = cfg.env.obs_dim
        self.action_dim = cfg.env.action_dim
        self.fields = fields
        self.normalize()
        self.normed = False
        if cfg.env.normed:
            self.vmin, self.vmax = self._get_bounds()
            self.normed = True

        # shapes = {key: val.shape for key, val in self.fields.items()}
        # print(f'[ datasets/mujoco ] Dataset fields: {shapes}')

    def sequence_dataset(self, env, dataset=None):
        """
        Overview:
            Sequence the dataset.
        Arguments:
            - env (:obj:`gym.Env`): The gym env.
        """

        import collections
        N = dataset['rewards'].shape[0]
        if 'maze2d' in env.spec.id:
            dataset = self.maze2d_set_terminals(env, dataset)
        data_ = collections.defaultdict(list)

        # The newer version of the dataset adds an explicit
        # timeouts field. Keep old method for backwards compatability.
        use_timeouts = 'timeouts' in dataset

        episode_step = 0
        for i in range(N):
            done_bool = bool(dataset['terminals'][i])
            if use_timeouts:
                final_timestep = dataset['timeouts'][i]
            else:
                final_timestep = (episode_step == env._max_episode_steps - 1)

            for k in dataset:
                if 'metadata' in k:
                    continue
                data_[k].append(dataset[k][i])

            if done_bool or final_timestep:
                episode_step = 0
                episode_data = {}
                for k in data_:
                    episode_data[k] = np.array(data_[k])
                if 'maze2d' in env.spec.id:
                    episode_data = self.process_maze2d_episode(episode_data)
                yield episode_data
                data_ = collections.defaultdict(list)

            episode_step += 1

    def maze2d_set_terminals(self, env, dataset):
        """
        Overview:
            Set the terminals for maze2d.
        Arguments:
            - env (:obj:`gym.Env`): The gym env.
            - dataset (:obj:`dict`): The dataset dict.
        """

        goal = env.get_target()
        threshold = 0.5

        xy = dataset['observations'][:, :2]
        distances = np.linalg.norm(xy - goal, axis=-1)
        at_goal = distances < threshold
        timeouts = np.zeros_like(dataset['timeouts'])

        # timeout at time t iff
        #      at goal at time t and
        #      not at goal at time t + 1
        timeouts[:-1] = at_goal[:-1] * ~at_goal[1:]

        timeout_steps = np.where(timeouts)[0]
        path_lengths = timeout_steps[1:] - timeout_steps[:-1]

        print(
            f'[ utils/preprocessing ] Segmented {env.spec.id} | {len(path_lengths)} paths | '
            f'min length: {path_lengths.min()} | max length: {path_lengths.max()}'
        )

        dataset['timeouts'] = timeouts
        return dataset

    def process_maze2d_episode(self, episode):
        """
        Overview:
            Process the maze2d episode, adds in `next_observations` field to episode.
        Arguments:
            - episode (:obj:`dict`): The episode dict.
        """

        assert 'next_observations' not in episode
        length = len(episode['observations'])
        next_observations = episode['observations'][1:].copy()
        for key, val in episode.items():
            episode[key] = val[:-1]
        episode['next_observations'] = next_observations
        return episode

    def normalize(self, keys=['observations', 'actions']):
        """
        Overview:
            Normalize the dataset, normalize fields that will be predicted by the diffusion model
        Arguments:
            - keys (:obj:`list`): The list of keys.
        """

        for key in keys:
            array = self.fields[key].reshape(self.n_episodes * self.max_path_length, -1)
            normed = self.normalizer.normalize(array, key)
            self.fields[f'normed_{key}'] = normed.reshape(self.n_episodes, self.max_path_length, -1)

    def make_indices(self, path_lengths, horizon):
        """
        Overview:
            Make indices for sampling from dataset. Each index maps to a datapoint.
        Arguments:
            - path_lengths (:obj:`np.ndarray`): The path length array.
            - horizon (:obj:`int`): The horizon.
        """

        indices = []
        for i, path_length in enumerate(path_lengths):
            max_start = min(path_length - 1, self.max_path_length - horizon)
            if not self.use_padding:
                max_start = min(max_start, path_length - horizon)
            for start in range(max_start):
                end = start + horizon
                indices.append((i, start, end))
        indices = np.array(indices)
        return indices

    def get_conditions(self, observations):
        """
        Overview:
            Get the conditions on current observation for planning.
        Arguments:
            - observations (:obj:`np.ndarray`): The observation array.
        """

        if 'maze2d' in self.env_id:
            return {'condition_id': [0, self.horizon - 1], 'condition_val': [observations[0], observations[-1]]}
        else:
            return {'condition_id': [0], 'condition_val': [observations[0]]}

    def __len__(self):
        """
        Overview:
            Get the length of the dataset.
        """

        return len(self.indices)

    def _get_bounds(self):
        """
        Overview:
            Get the bounds of the dataset.
        """

        print('[ datasets/sequence ] Getting value dataset bounds...', end=' ', flush=True)
        vmin = np.inf
        vmax = -np.inf
        for i in range(len(self.indices)):
            value = self.__getitem__(i)['returns'].item()
            vmin = min(value, vmin)
            vmax = max(value, vmax)
        print('✓')
        return vmin, vmax

    def normalize_value(self, value):
        """
        Overview:
            Normalize the value.
        Arguments:
            - value (:obj:`np.ndarray`): The value array.
        """

        # [0, 1]
        normed = (value - self.vmin) / (self.vmax - self.vmin)
        # [-1, 1]
        normed = normed * 2 - 1
        return normed

    def __getitem__(self, idx, eps=1e-4):
        """
        Overview:
            Get the item of the dataset.
        Arguments:
            - idx (:obj:`int`): The index of the dataset.
            - eps (:obj:`float`): The epsilon.
        """

        path_ind, start, end = self.indices[idx]

        observations = self.fields['normed_observations'][path_ind, start:end]
        actions = self.fields['normed_actions'][path_ind, start:end]
        done = self.fields['terminals'][path_ind, start:end]

        # conditions = self.get_conditions(observations)
        trajectories = np.concatenate([actions, observations], axis=-1)

        if self.include_returns:
            rewards = self.fields['rewards'][path_ind, start:]
            discounts = self.discounts[:len(rewards)]
            returns = (discounts * rewards).sum()
            if self.normed:
                returns = self.normalize_value(returns)
            returns = np.array([returns / self.returns_scale], dtype=np.float32)
            batch = {
                'trajectories': trajectories,
                'returns': returns,
                'done': done,
                'action': actions,
            }
        else:
            batch = {
                'trajectories': trajectories,
                'done': done,
                'action': actions,
            }

        batch.update(self.get_conditions(observations))
        return batch


def hdf5_save(exp_data, expert_data_path):
    """
    Overview:
        Save the data to hdf5.
    """

    try:
        import h5py
    except ImportError:
        import sys
        logging.warning("not found h5py package, please install it trough 'pip install h5py' ")
        sys.exit(1)
    dataset = dataset = h5py.File('%s_demos.hdf5' % expert_data_path.replace('.pkl', ''), 'w')
    dataset.create_dataset('obs', data=np.array([d['obs'].numpy() for d in exp_data]), compression='gzip')
    dataset.create_dataset('action', data=np.array([d['action'].numpy() for d in exp_data]), compression='gzip')
    dataset.create_dataset('reward', data=np.array([d['reward'].numpy() for d in exp_data]), compression='gzip')
    dataset.create_dataset('done', data=np.array([d['done'] for d in exp_data]), compression='gzip')
    dataset.create_dataset('next_obs', data=np.array([d['next_obs'].numpy() for d in exp_data]), compression='gzip')


def naive_save(exp_data, expert_data_path):
    """
    Overview:
        Save the data to pickle.
    """

    with open(expert_data_path, 'wb') as f:
        pickle.dump(exp_data, f)


def offline_data_save_type(exp_data, expert_data_path, data_type='naive'):
    """
    Overview:
        Save the offline data.
    """

    globals()[data_type + '_save'](exp_data, expert_data_path)


def create_dataset(cfg, **kwargs) -> Dataset:
    """
    Overview:
        Create dataset.
    """

    cfg = EasyDict(cfg)
    import_module(cfg.get('import_names', []))
    return DATASET_REGISTRY.build(cfg.policy.collect.data_type, cfg=cfg, **kwargs)