import numpy as np


class QLearningAgent:
    """Tabular Q-learning agent backed by a NumPy Q-table.

    The environment passed in must expose ``observation_space.n`` and
    ``action_space.n``; they determine the number of rows (states) and
    columns (actions) of the Q-table respectively.
    """

    def __init__(self, env) -> None:
        """Store ``env`` and allocate a zero-initialised Q-table for it."""
        self.env = env
        self.q_table = self.build_q_table(
            env.observation_space.n, env.action_space.n
        )

    def build_q_table(self, n_states: int, n_actions: int) -> np.ndarray:
        """Return a ``(n_states, n_actions)`` array filled with zeros."""
        return np.zeros((n_states, n_actions))

    def epsilon_greedy_policy(self, state, epsilon: float):
        """Pick an action for ``state`` using an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action index in
        ``[0, env.action_space.n)`` is returned; otherwise the greedy
        action (highest Q-value for ``state``) is returned.
        """
        if np.random.random() < epsilon:
            # Explore: sample any action index uniformly at random.
            return np.random.choice(self.env.action_space.n)
        # Exploit: reuse the greedy rule so both policies stay in sync.
        return self.greedy_policy(state)

    def greedy_policy(self, state):
        """Return the action index with the highest Q-value for ``state``.

        Ties are resolved by ``np.argmax``, i.e. the lowest index wins.
        """
        return np.argmax(self.q_table[state])

    def update_q_table(
        self, state, action, reward, gamma, learning_rate, new_state
    ) -> None:
        """Apply one Q-learning update to the entry for ``(state, action)``.

        Implements::

            Q(s, a) := Q(s, a) + lr * [R(s, a) + gamma * max_a' Q(s', a') - Q(s, a)]

        Args:
            state: Index of the state the action was taken in.
            action: Index of the action that was taken.
            reward: Reward observed for the transition.
            gamma: Discount factor applied to the best next-state value.
            learning_rate: Step size of the update.
            new_state: Index of the state reached after the action.
        """
        current_q = self.q_table[state, action]
        next_max_q = np.max(self.q_table[new_state])
        # Temporal-difference error between the bootstrapped target and
        # the current estimate.
        td_error = reward + gamma * next_max_q - current_q
        self.q_table[state, action] = current_q + learning_rate * td_error