# Code Reference: https://github.com/OptMLGroup/DeepBeerInventory-RL.
import argparse
import numpy as np


# Defines the agent class for the BeerGame
class Agent(object):
    # initializes the agent with initial values for IL and OO, and stores agentNum to identify the agent
    def __init__(
            self, agentNum: int, IL: int, AO: int, AS: int, c_h: float, c_p: float, eta: int, compuType: str,
            config: argparse.Namespace
    ) -> None:
        self.agentNum = agentNum
        self.IL = IL  # Inventory level of each agent - changes during the game
        self.OO = 0  # Open order of each agent - changes during the game
        self.ASInitial = AS  # the initial arriving shipment
        self.ILInitial = IL  # initial IL at the start of each game
        self.AOInitial = AO  # initial AO at the start of each game
        self.config = config  # an instance of config is stored inside the class
        self.curState = []  # the current state of the game
        self.nextState = []
        self.curReward = 0  # the reward observed at the current step
        self.cumReward = 0  # cumulative reward; reset at the beginning of each episode
        self.totRew = 0  # total reward of all players, attributed to the current player
        self.c_h = c_h  # holding cost
        self.c_p = c_p  # backorder cost
        self.eta = eta  # the total-cost regularizer
        self.AS = np.zeros((1, 1))  # arrived shipments
        self.AO = np.zeros((1, 1))  # arrived orders
        self.action = 0  # the action at time t
        self.compType = compuType  # rnd -> random / srdqn -> srdqn / Strm -> formula (Rong 2008) / bs -> optimal base-stock policy if it exists
        self.alpha_b = self.config.alpha_b[self.agentNum]  # parameters for the formula
        self.betta_b = self.config.betta_b[self.agentNum]  # parameters for the formula
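        # a_b is (roughly) the mean demand and b_b is the mean demand times the expected total
        # lead time, i.e. the expected pipeline inventory; together they parameterize the Strm
        # ordering formula (Rong 2008) mentioned above. The branches below only differ in how
        # the mean demand is obtained for each demand distribution.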
        if self.config.demandDistribution == 0:
            self.a_b = np.mean((self.config.demandUp, self.config.demandLow))  # parameters for the formula
            self.b_b = np.mean((self.config.demandUp, self.config.demandLow)) * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            )  # parameters for the formula
        elif self.config.demandDistribution in (1, 3, 4):
            self.a_b = self.config.demandMu  # parameters for the formula
            self.b_b = self.config.demandMu * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            )  # parameters for the formula
        elif self.config.demandDistribution == 2:
            self.a_b = 8  # parameters for the formula
            self.b_b = (3 / 4.) * 8 * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            )  # parameters for the formula
        elif self.config.demandDistribution == 3:  # NOTE: unreachable, demandDistribution == 3 is already handled above
            self.a_b = 10  # parameters for the formula
            self.b_b = 7 * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            )  # parameters for the formula
        else:
            raise Exception('The demand distribution is not defined or is not a valid type.')

        self.hist = []  # this is used for plotting - keeps the history for only one game
        self.hist2 = []  # this is used for animation
        self.srdqnBaseStock = []  # holds the base-stock levels that srdqn has come up with (added on Nov 8, 2017)
        self.T = 0
        self.bsBaseStock = 0
        self.init_bsBaseStock = 0
        self.nextObservation = []

        if self.compType == 'srdqn':
            # sets the initial input of the network
            self.currentState = np.stack(
                [self.curState for _ in range(self.config.multPerdInpt)], axis=0
            )  # multPerdInpt observations stacked. each row is an observation

    # reset player information
    def resetPlayer(self, T: int):
        self.IL = self.ILInitial
        self.OO = 0
        self.AS = np.squeeze(
            np.zeros((1, T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10))
        )  # arrived shipments
        self.AO = np.squeeze(
            np.zeros((1, T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10))
        )  # arrived orders
        if self.agentNum != 0:
            for i in range(self.config.leadRecOrderUp_aux[self.agentNum - 1]):
                self.AO[i] = self.AOInitial[self.agentNum - 1]
        for i in range(self.config.leadRecItemUp[self.agentNum]):
            self.AS[i] = self.ASInitial
        self.curReward = 0  # the reward observed at the current step
        self.cumReward = 0  # cumulative reward; reset at the beginning of each episode
        self.action = []
        self.hist = []
        self.hist2 = []
        self.srdqnBaseStock = []  # holds the base-stock levels that srdqn has come up with (added on Nov 8, 2017)
        self.T = T
        self.curObservation = self.getCurState(1)  # get the current observation of the game
        self.nextObservation = []
        if self.compType == 'srdqn':
            self.currentState = np.stack([self.curObservation for _ in range(self.config.multPerdInpt)], axis=0)

    # updates the IL and OO at time t, after receiving AS[time] items
    def recieveItems(self, time: int) -> None:
        self.IL = self.IL + self.AS[time]  # inventory level update
        self.OO = self.OO - self.AS[time]  # inventory in transit (open orders) update

    # returns the order quantity associated with the chosen action
    def actionValue(self, curTime: int) -> float:
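        # Illustrative example (values are assumptions, not repo defaults): with
        # actionList = [-2, -1, 0, 1, 2], action_step = 2, a one-hot self.action selecting "+1",
        # and AO[curTime] = 4, the srdqn branch orders max(0, 1 * 2 + 4) = 6, i.e. the incoming
        # order plus a learned offset x ("d + x" rule), while the rnd branch orders max(0, 1 + 4) = 5.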
        if self.config.fixedAction:
            a = self.config.actionList[np.argmax(self.action)]
        else:
            # "d + x" rule
            if self.compType == 'srdqn':
                a = max(0, self.config.actionList[np.argmax(self.action)] * self.config.action_step + self.AO[curTime])
            elif self.compType == 'rnd':
                a = max(0, self.config.actionList[np.argmax(self.action)] + self.AO[curTime])
            else:
                a = max(0, self.config.actionListOpt[np.argmax(self.action)])

        return a

    # getReward computes the cost-based reward at the current state and updates the cumulative reward
    def getReward(self) -> None:
        # cost (holding + backorder) for one time unit
        self.curReward = (self.c_p * max(0, -self.IL) + self.c_h * max(0, self.IL)) / 200.  # self.config.Ttest #
        self.curReward = -self.curReward  # make the reward negative, because it is a cost
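        # Example (illustrative numbers): IL = -3 (three backordered units) with c_p = 2 and
        # c_h = 1 gives curReward = -(2 * 3 + 1 * 0) / 200. = -0.03 for this step.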

        # accumulate the discounted cumulative reward of the agent
        self.cumReward = self.config.gamma * self.cumReward + self.curReward

    # This function returns a np.array of the current state of the agent
    def getCurState(self, t: int) -> np.ndarray:
        if self.config.ifUseASAO:
            if self.config.if_use_AS_t_plus_1:
                curState = np.array(
                    [-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO, self.AS[t], self.AO[t]]
                )
            else:
                curState = np.array(
                    [-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO, self.AS[t - 1], self.AO[t]]
                )
        else:
            curState = np.array([-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO])

        if self.config.ifUseActionInD:
            a = self.config.actionList[np.argmax(self.action)]
            curState = np.concatenate((curState, np.array([a])))

        return curState
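

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original repository file). The config
# field names below are inferred from how `config` is accessed in this module;
# the concrete values are illustrative assumptions only, not the defaults used
# by the DeepBeerInventory-RL training scripts.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    demo_config = argparse.Namespace(
        demandDistribution=0,                     # uniform demand in [demandLow, demandUp]
        demandLow=0, demandUp=2, demandMu=1,
        leadRecItemLow=[2], leadRecItemUp=[2],    # shipment lead-time bounds per agent
        leadRecOrderLow=[2], leadRecOrderUp=[2],  # order lead-time bounds per agent
        leadRecOrderUp_aux=[2],
        alpha_b=[-0.5], betta_b=[-0.2],           # Strm formula parameters per agent
        multPerdInpt=5,                           # number of stacked observations for srdqn
        fixedAction=False,
        actionList=[-2, -1, 0, 1, 2], action_step=1, actionListOpt=[0],
        gamma=0.99,
        ifUseASAO=True, if_use_AS_t_plus_1=False, ifUseActionInD=False,
    )
    agent = Agent(agentNum=0, IL=10, AO=[4], AS=4, c_h=1.0, c_p=2.0,
                  eta=1, compuType='rnd', config=demo_config)
    agent.resetPlayer(T=100)
    agent.action = np.eye(len(demo_config.actionList))[3]  # one-hot action selecting "+1"
    print('state:', agent.getCurState(1))
    print('order quantity at t=0:', agent.actionValue(curTime=0))
    agent.getReward()
    print('reward:', agent.curReward)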