# policy_iteration.py
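# Note: despite the name, policy_iteration() below implements a depth-limited,
# expectimax-style expected-value lookahead with memoization keyed on
# (head, food, tail) locations, rather than classical policy iteration's
# alternating evaluation/improvement sweeps.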
from policy import Policy


class PolicyIteration(Policy):
    def __init__(self):
        super().__init__()
        self._config = None
        self._reward = None
        self._gamestate = None
        self._policy_results = {}
    @property
    def policy_results(self):
        return self._policy_results

    @property
    def reward(self):
        return self._reward

    @reward.setter
    def reward(self, reward):
        self._reward = reward
    ##
    # @param gamestate The current gamestate
    # @return an absolute direction
    def best_move(self, gamestate):
        results = self.policy_iteration(gamestate, 10)
        scrambledMove = self.moveScrambler(results[1])
        move = self.relativeToAbsoluteDirection(
            gamestate.getAbsoluteHeadDirection(),
            scrambledMove
        )
        return move

    def policy(self):
        return self.location_
    ##
    # Accessor Method For Instance Variable: self._config
    #
    @property
    def config(self):
        return self._config

    ##
    # Mutator Method For Instance Variable: self._config
    #
    @config.setter
    def config(self, config):
        self._config = config
    ##
    # Accessor Method For Instance Variable: self._gamestate
    # Note: returns a copy of the game state
    #
    @property
    def gamestate(self):
        return self._gamestate.copyGameState()

    ##
    # Mutator Method For Instance Variable: self._gamestate
    #
    @gamestate.setter
    def gamestate(self, gamestate):
        self._gamestate = gamestate
    ##
    # @param gamestate The current gamestate
    # @param iteration The number of iterations (lookahead depth) to run
    # @return a tuple (value, action): the best value and the relative action that achieves it
    #
    def policy_iteration(self, gamestate, iteration):
        # Base case: no lookahead budget left
        if iteration <= 0:
            return 0, None
        localRewardList = {}
        highest_reward = float("-inf")
        best_action = None
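        # For each relative action a, the loop below computes a one-step
        # expected-value backup over that action's stochastic outcomes
        # (a sketch of the intent, inferred from the code):
        #   V(a) = sum over outcomes s' of P(s' | a) * (R(s') + gamma * V(s'))
        # normalized by the total probability mass of the listed outcomes.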
        # Loop through each possible action and calculate its expected value
        for action, possible_actions in self._config.stochastic.directions.items():
            localReward = 0
            localProbTotal = 0  # Used for normalization
            # Loop through each possible outcome direction of this action
            for possible_action, probability in possible_actions.items():
                localNewDirection = self.relativeToAbsoluteDirection(
                    gamestate.getAbsoluteHeadDirection(),
                    possible_action
                )
                # Simulate one game step in the outcome direction
                possible_gamestate = gamestate.copyGameState()
                possible_gamestate.gameLogicIteration(localNewDirection[0], localNewDirection[1])
                current_reward = self.rewardValue(possible_gamestate)
                # Cut the lookahead short for zero-probability outcomes
                # and terminal states
                tempIteration = iteration
                if probability == 0:
                    tempIteration = 0
                if possible_gamestate.getGameEnd():
                    tempIteration = 0
                # Dynamic programming: use the cached value for the successor
                # state if one exists; otherwise, recurse
                head = tuple(possible_gamestate.getHeadLocation())
                food = tuple(possible_gamestate.getFoodLocation())
                tail = tuple(tuple(segment) for segment in possible_gamestate.getTailListLocation())
                key = (head, food, tail)
                if key in self.policy_results:
                    current_value = self.policy_results[key][0]
                else:
                    current_value = self.policy_iteration(possible_gamestate, tempIteration - 1)[0]
                localReward += probability * (current_reward + self._config.discount.gamma * current_value)
                localProbTotal += probability
            localReward /= localProbTotal  # Normalization
            # Track the highest-value action
            if localReward > highest_reward:
                highest_reward = localReward
                best_action = action
            localRewardList[action] = localReward
        # If FORWARD ties with the best action, prefer FORWARD
        if localRewardList[best_action] == localRewardList["FORWARD"]:
            best_action = "FORWARD"
        # Cache the result for this state
        head = tuple(gamestate.getHeadLocation())
        food = tuple(gamestate.getFoodLocation())
        tail = tuple(tuple(segment) for segment in gamestate.getTailListLocation())
        key = (head, food, tail)
        self.policy_results[key] = (highest_reward, best_action)
        return highest_reward, best_action
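
# Example usage (a hedged sketch): this assumes a game-state object exposing the
# methods used above (copyGameState, getAbsoluteHeadDirection, gameLogicIteration,
# getGameEnd, getHeadLocation, getFoodLocation, getTailListLocation) and a config
# object exposing .stochastic.directions (a dict of {action: {outcome: probability}})
# and .discount.gamma. The names "config" and "gamestate" here are illustrative.
#
#   agent = PolicyIteration()
#   agent.config = config
#   move = agent.best_move(gamestate)  # returns an absolute direction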