import random

import numpy as np

from ray.rllib.models.modelv2 import restore_original_dimensions
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override


class HeuristicBase(Policy):
    """Base class for scripted policies that act on a legal-action mask."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()

    @override(Policy)
    def learn_on_batch(self, samples):
        """Heuristics don't learn; this is a no-op."""
        pass

    @override(Policy)
    def get_weights(self):
        """No weights to save."""
        return {}

    @override(Policy)
    def set_weights(self, weights):
        """No weights to set."""
        pass

    @override(Policy)
    def compute_actions(
        self,
        obs_batch,
        state_batches=None,
        prev_action_batch=None,
        prev_reward_batch=None,
        info_batch=None,
        episodes=None,
        **kwargs
    ):
        # RLlib hands us flattened (preprocessed) observations; unflatten them
        # back into the original dict space so that "action_mask" and
        # "observation" can be read directly.
        obs_batch = restore_original_dimensions(
            np.array(obs_batch, dtype=np.float32),
            self.observation_space,
            tensorlib=np,
        )
        return self._do_compute_actions(obs_batch)

    def pick_legal_action(self, legal_action):
        # legal_action is a 0/1 mask; sample uniformly among the 1-entries.
        legal_choices = np.arange(len(legal_action))[legal_action == 1]
        return np.random.choice(legal_choices)


class AlwaysSameHeuristic(HeuristicBase):
    """Pick a random column and stick with it, re-picking only once that
    column becomes illegal (i.e., fills up)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance state instead of a class attribute, so each policy
        # instance makes its own initial pick. 7 columns, as in Connect Four.
        self._rand_choice = random.choice(range(7))

    def _do_compute_actions(self, obs_batch):
        def select_action(legal_action):
            legal_choices = np.arange(len(legal_action))[legal_action == 1]
            if self._rand_choice not in legal_choices:
                self._rand_choice = np.random.choice(legal_choices)
            return self._rand_choice

        return [select_action(x) for x in obs_batch["action_mask"]], [], {}


class LinearHeuristic(HeuristicBase):
    """Sweep across the board: start from a random column and keep stepping
    in a random fixed direction, re-randomizing whenever the next column is
    illegal."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._rand_choice = random.choice(range(7))
        self._rand_sign = np.random.choice([-1, 1])

    def _do_compute_actions(self, obs_batch):
        def select_action(legal_action):
            legal_choices = np.arange(len(legal_action))[legal_action == 1]
            self._rand_choice += self._rand_sign
            if self._rand_choice not in legal_choices:
                self._rand_choice = np.random.choice(legal_choices)
            return self._rand_choice

        return [select_action(x) for x in obs_batch["action_mask"]], [], {}


class BeatLastHeuristic(HeuristicBase):
    """Mirror the opponent's last move: play in a column where the opponent
    holds more pieces than we do, falling back to a random legal action."""

    def _do_compute_actions(self, obs_batch):
        def select_action(legal_action, observation):
            legal_choices = np.arange(len(legal_action))[legal_action == 1]
            # observation has shape (rows, cols, 2): channel 0 holds our
            # pieces, channel 1 the opponent's. Summing over rows gives
            # per-column piece counts for both players.
            obs_sums = np.sum(observation, axis=0)
            # Columns where the opponent is ahead; with alternating play,
            # this is where the opponent just moved.
            desired_actions = np.squeeze(
                np.argwhere(obs_sums[:, 0] < obs_sums[:, 1])
            )
            if desired_actions.size == 0:
                return np.random.choice(legal_choices)
            if desired_actions.size == 1:
                # np.squeeze produced a 0-d array; [()] extracts the scalar.
                desired_action = desired_actions[()]
            else:
                desired_action = np.random.choice(desired_actions)
            if desired_action in legal_choices:
                return desired_action
            return np.random.choice(legal_choices)

        return (
            [
                select_action(x, y)
                for x, y in zip(obs_batch["action_mask"], obs_batch["observation"])
            ],
            [],
            {},
        )


class RandomHeuristic(HeuristicBase):
    """Just pick a random legal action.

    The environment's observation must be a dict with an "action_mask" key
    containing the legal-action mask for the agent.
    """

    def _do_compute_actions(self, obs_batch):
        return [self.pick_legal_action(x) for x in obs_batch["action_mask"]], [], {}
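

# ---------------------------------------------------------------------------
# Usage sketch (an illustrative assumption, not part of the original module):
# scripted policies like these are typically wired into an RLlib multi-agent
# setup via PolicySpec, with a trained policy facing a frozen heuristic. The
# policy ids, agent ids, and mapping function below are hypothetical
# placeholders for whatever the surrounding project actually uses.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from ray.rllib.policy.policy import PolicySpec

    multiagent_config = {
        "policies": {
            # Default PolicySpec: trained by whichever algorithm (e.g. PPO)
            # consumes this config.
            "learned": PolicySpec(),
            # Scripted opponent: no weights, nothing to train.
            "random": PolicySpec(policy_class=RandomHeuristic),
        },
        # Hypothetical mapping: the first seat learns, the second seat plays
        # the heuristic.
        "policy_mapping_fn": lambda agent_id, **kwargs: (
            "learned" if agent_id == "player_0" else "random"
        ),
        # Only the learned policy receives gradient updates.
        "policies_to_train": ["learned"],
    }
    print(sorted(multiagent_config["policies"]))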