value_iteration.py
__author__ = 'nick'
# Value iteration for a finite MDP. Inputs are a transition probability matrix
# P[s, a, s'] and a reward matrix R[s, a, s'], both of shape
# (num_states, num_actions, num_states).
import numpy as np

DEBUG = False
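
# The backup applied in vi() is the Bellman optimality update
#     V(s) <- max_a sum_{s'} P[s, a, s'] * (R[s, a, s'] + gamma * V(s')),
# swept over all states until the largest change to V in one sweep drops below theta.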

def vi(P, R, gamma=None, theta=None):
    # Initialisation
    if gamma is None:
        gamma = 0.9      # discount factor
    if theta is None:
        theta = 0.0001   # stop once the largest per-sweep change in V is below this
    num_states = P.shape[0]
    num_actions = P.shape[1]

    # Calculate the values
    V = np.zeros(num_states)
    while True:
        delta = 0
        oldV = V.copy()  # snapshot of the previous sweep (a plain assignment would only alias V)
        for i in range(num_states):
            v = V[i]
            V[i] = -np.inf  # running maximum over actions; -inf so negative values survive
            for j in range(num_actions):
                next_states = np.argwhere(P[i, j, :] > 0).flatten().tolist()
                tmp_val = 0
                for state in next_states:
                    tmp_val += P[i, j, state] * (R[i, j, state] + gamma * oldV[state])
                if tmp_val > V[i]:
                    V[i] = tmp_val
            if delta < abs(v - V[i]):
                delta = abs(v - V[i])
        if delta < theta:
            break
        if DEBUG:
            print(V)

    # Calculate the policy: act greedily with respect to V, splitting ties uniformly
    policy = np.zeros((num_states, num_actions))
    for i in range(num_states):
        values = np.zeros(num_actions)
        for j in range(num_actions):
            next_states = np.argwhere(P[i, j, :] > 0).flatten().tolist()
            for state in next_states:
                values[j] += P[i, j, state] * (R[i, j, state] + gamma * V[state])
        max_val = np.amax(values)
        idx = np.argwhere(values == max_val).flatten().tolist()
        prob = 1.0 / len(idx)
        for k in range(len(idx)):
            policy[i, idx[k]] = prob
    if DEBUG:
        print(policy)
    return policy
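
# Usage: for an MDP with S states and A actions, build P and R with shape (S, A, S)
# and call policy = vi(P, R). policy[s] is then a length-A probability vector that is
# uniform over the greedy actions in state s; main() below builds a 4x4 gridworld example.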


def main():
    # Deterministic 4x4 gridworld. State index = row * 4 + col, with j as the row
    # and k as the column. Actions: 0 decreases the row, 1 increases the row,
    # 2 increases the column, 3 decreases the column; a move off the grid leaves
    # the state unchanged.
    P = np.zeros((16, 4, 16))
    for i in range(4):  # For each action
        for j in range(4):  # row
            for k in range(4):  # column
                if i == 0:  # decrease row
                    if j == 0:
                        nextj = j
                    else:
                        nextj = j - 1
                    nextk = k
                if i == 1:  # increase row
                    if j == 3:
                        nextj = j
                    else:
                        nextj = j + 1
                    nextk = k
                if i == 2:  # increase column
                    if k == 3:
                        nextk = k
                    else:
                        nextk = k + 1
                    nextj = j
                if i == 3:  # decrease column
                    if k == 0:
                        nextk = k
                    else:
                        nextk = k - 1
                    nextj = j
                P[j * 4 + k, i, nextj * 4 + nextk] = 1
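
    # Example of the encoding: state 6 is row 1, column 2; action 2 increases the
    # column, so P[6, 2, 7] = 1 and the rest of P[6, 2, :] stays zero.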

    # Rewards: -1 for every transition, except transitions that land on the goal
    # cell (3, 3), which pay +100.
    goal = (3, 3)
    R = np.zeros((16, 4, 16))
    for i in range(4):  # For each action
        for j in range(4):  # row
            for k in range(4):  # column
                # Same deterministic successor as in the transition loop above
                if i == 0:
                    if j == 0:
                        nextj = j
                    else:
                        nextj = j - 1
                    nextk = k
                if i == 1:
                    if j == 3:
                        nextj = j
                    else:
                        nextj = j + 1
                    nextk = k
                if i == 2:
                    if k == 3:
                        nextk = k
                    else:
                        nextk = k + 1
                    nextj = j
                if i == 3:
                    if k == 0:
                        nextk = k
                    else:
                        nextk = k - 1
                    nextj = j
                if nextj == goal[0] and nextk == goal[1]:
                    R[j * 4 + k, i, nextj * 4 + nextk] = 100
                else:
                    R[j * 4 + k, i, nextj * 4 + nextk] = -1
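
    # Note: bumping into the wall from the goal (e.g. action 1 in (3, 3)) lands back
    # on (3, 3) and pays +100 again, so the goal's value satisfies
    # V[15] = 100 + 0.9 * V[15]; from state 14 (row 3, col 2) the greedy choice is
    # action 2, whose lookahead value is 1 * (100 + 0.9 * V[15]).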

    print(P)
    print(R)
    policy = vi(P, R)
    print(policy[0])
    max_prob = np.amax(policy[0])
    idx = np.argwhere(policy[0] == max_prob).flatten().tolist()  # greedy action(s) in state 0
    if len(idx) == 1:
        print(idx[0])
    else:
        print("MORE THAN ONE INDEX")


if __name__ == "__main__":
    main()