# Code Reference: https://github.com/OptMLGroup/DeepBeerInventory-RL.
import argparse
import numpy as np
# Here we want to define the agent class for the BeerGame
class Agent(object):
    # initializes the agent with initial values for IL, OO and saves self.agentNum for identifying the agent.
    def __init__(
        self, agentNum: int, IL: int, AO: int, AS: int, c_h: float, c_p: float, eta: int, compuType: str,
        config: argparse.Namespace
    ) -> None:
        self.agentNum = agentNum
        self.IL = IL # Inventory level of each agent - changes during the game
        self.OO = 0 # Open order of each agent - changes during the game
        self.ASInitial = AS # the initial arriving shipment.
        self.ILInitial = IL # the IL at which each game starts
        self.AOInitial = AO # the AO at which each game starts
        self.config = config # an instance of config is stored inside the class
        self.curState = [] # the current state of the game
        self.nextState = []
        self.curReward = 0 # the reward observed at the current step
        self.cumReward = 0 # cumulative reward; reset at the beginning of each episode
        self.totRew = 0 # total reward of all players, tracked for the current player
        self.c_h = c_h # holding cost
        self.c_p = c_p # backorder cost
        self.eta = eta # the total cost regularizer
        self.AS = np.zeros((1, 1)) # arrived shipment
        self.AO = np.zeros((1, 1)) # arrived order
        self.action = 0 # the action at time t
        self.compType = compuType
        # self.compTypeTrain = compuType # rnd -> random / srdqn-> srdqn / Strm-> formula-Rong2008 / bs -> optimal policy if exists
        # self.compTypeTest = compuType # rnd -> random / srdqn-> srdqn / Strm-> formula-Rong2008 / bs -> optimal policy if exists
        self.alpha_b = self.config.alpha_b[self.agentNum] # parameters for the formula
        self.betta_b = self.config.betta_b[self.agentNum] # parameters for the formula
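        # In the branches below, a_b approximates the expected demand per period and b_b
        # approximates the expected demand over the combined shipment and order lead times;
        # both feed the formula-based ("Strm") ordering rule.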
        if self.config.demandDistribution == 0:
            self.a_b = np.mean((self.config.demandUp, self.config.demandLow)) # parameters for the formula
            self.b_b = np.mean((self.config.demandUp, self.config.demandLow)) * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            ) # parameters for the formula
        elif self.config.demandDistribution == 1 or self.config.demandDistribution == 3 or self.config.demandDistribution == 4:
            self.a_b = self.config.demandMu # parameters for the formula
            self.b_b = self.config.demandMu * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            ) # parameters for the formula
        elif self.config.demandDistribution == 2:
            self.a_b = 8 # parameters for the formula
            self.b_b = (3 / 4.) * 8 * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            ) # parameters for the formula
        elif self.config.demandDistribution == 3: # NOTE: this branch is unreachable because distribution 3 is already handled above
            self.a_b = 10 # parameters for the formula
            self.b_b = 7 * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            ) # parameters for the formula
        else:
            raise Exception('The demand distribution is not defined or is not a valid type.')
        self.hist = [] # this is used for plotting - keeps the history for only one game
        self.hist2 = [] # this is used for animation usage
        self.srdqnBaseStock = [] # this holds the base stock levels that srdqn has come up with. added on Nov 8, 2017
        self.T = 0
        self.bsBaseStock = 0
        self.init_bsBaseStock = 0
        self.nextObservation = []
        if self.compType == 'srdqn':
            # sets the initial input of the network
            self.currentState = np.stack(
                [self.curState for _ in range(self.config.multPerdInpt)], axis=0
            ) # multPerdInpt observations stacked; each row is one observation
    # reset player information
    def resetPlayer(self, T: int):
        self.IL = self.ILInitial
        self.OO = 0
        self.AS = np.squeeze(
            np.zeros((1, T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10))
        ) # arrived shipment
        self.AO = np.squeeze(
            np.zeros((1, T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10))
        ) # arrived order
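        # For every agent except agent 0, pre-fill the first slots of AO and AS so the order and
        # shipment pipelines start the game with the configured initial values already in transit.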
        if self.agentNum != 0:
            for i in range(self.config.leadRecOrderUp_aux[self.agentNum - 1]):
                self.AO[i] = self.AOInitial[self.agentNum - 1]
            for i in range(self.config.leadRecItemUp[self.agentNum]):
                self.AS[i] = self.ASInitial
        self.curReward = 0 # the reward observed at the current step
        self.cumReward = 0 # cumulative reward; reset at the beginning of each episode
        self.action = []
        self.hist = []
        self.hist2 = []
        self.srdqnBaseStock = [] # this holds the base stock levels that srdqn has come up with. added on Nov 8, 2017
        self.T = T
        self.curObservation = self.getCurState(1) # this function gets the current state of the game
        self.nextObservation = []
        if self.compType == 'srdqn':
            self.currentState = np.stack([self.curObservation for _ in range(self.config.multPerdInpt)], axis=0)
    # updates the IL and OO at time t, after receiving the arriving shipment AS[time]
    def recieveItems(self, time: int) -> None:
        self.IL = self.IL + self.AS[time] # inventory level update
        self.OO = self.OO - self.AS[time] # on-order (in-transit) inventory update
    # finds the order quantity (action value) associated with the chosen action
    def actionValue(self, curTime: int) -> float:
        if self.config.fixedAction:
            a = self.config.actionList[np.argmax(self.action)]
        else:
            # "d + x" rule
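            # The agent orders the demand it just observed (AO[curTime]) plus an adjustment x
            # chosen from the action list (scaled by action_step for srdqn), clipped at zero.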
            if self.compType == 'srdqn':
                a = max(0, self.config.actionList[np.argmax(self.action)] * self.config.action_step + self.AO[curTime])
            elif self.compType == 'rnd':
                a = max(0, self.config.actionList[np.argmax(self.action)] + self.AO[curTime])
            else:
                a = max(0, self.config.actionListOpt[np.argmax(self.action)])
        return a
    # getReward computes the reward at the current state and updates curReward and cumReward
    def getReward(self) -> None:
        # cost (holding + backorder) for one time unit
        self.curReward = (self.c_p * max(0, -self.IL) + self.c_h * max(0, self.IL)) / 200. # self.config.Ttest #
        self.curReward = -self.curReward
        # make the reward negative, because it is a cost
        # sum the total reward of each agent
        self.cumReward = self.config.gamma * self.cumReward + self.curReward
    # returns an np.ndarray with the current state of the agent
    def getCurState(self, t: int) -> np.ndarray:
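        # State layout when ifUseASAO is set: [backorder level, on-hand inventory, open orders,
        # arrived shipment, arrived order]; otherwise only the first three entries are used.
        # The last chosen action is appended when ifUseActionInD is set.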
        if self.config.ifUseASAO:
            if self.config.if_use_AS_t_plus_1:
                curState = np.array(
                    [-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO, self.AS[t], self.AO[t]]
                )
            else:
                curState = np.array(
                    [-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO, self.AS[t - 1], self.AO[t]]
                )
        else:
            curState = np.array([-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO])
        if self.config.ifUseActionInD:
            a = self.config.actionList[np.argmax(self.action)]
            curState = np.concatenate((curState, np.array([a])))
        return curState
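

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): it builds a toy argparse.Namespace
# carrying the configuration attributes this class reads and exercises one agent
# for a single decision. Every value below is an assumption chosen for the
# example, not a setting taken from the original project configuration.
if __name__ == '__main__':
    toy_config = argparse.Namespace(
        alpha_b=[-0.5], betta_b=[-0.2], # formula ("Strm") coefficients, one entry per agent
        demandDistribution=0, demandLow=0, demandUp=3, # uniform customer demand on [0, 3]
        leadRecItemLow=[2], leadRecItemUp=[2], # shipment lead-time bounds per agent
        leadRecOrderLow=[2], leadRecOrderUp=[2], leadRecOrderUp_aux=[2], # order lead-time bounds per agent
        multPerdInpt=5, # number of stacked observations fed to srdqn
        actionList=[-2, -1, 0, 1, 2], actionListOpt=[0], action_step=1, fixedAction=False,
        gamma=0.99, # discount factor used in cumReward
        ifUseASAO=True, if_use_AS_t_plus_1=False, ifUseActionInD=False,
    )
    agent = Agent(
        agentNum=0, IL=10, AO=[4], AS=4, c_h=2.0, c_p=2.0, eta=1, compuType='rnd', config=toy_config
    )
    agent.resetPlayer(T=100)
    agent.action = np.eye(len(toy_config.actionList))[2] # one-hot choice of the middle action (x = 0)
    print('order quantity:', agent.actionValue(curTime=0))
    agent.getReward()
    print('current state:', agent.getCurState(t=1), 'reward:', agent.curReward)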