# minmax_Q_RL.py
import numpy as np
import random
from scipy.optimize import minimize
from plots import *  # local helper module (assumed to provide policy_iter, used at the end)
"""
we are modelling the game below:
###################
# (3, -3) (1, -1)#
# (2, -2) (4, -4)#
###################
"""
payoffs = [[3, 1],
[2, 4]]
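# Optional sanity-check sketch (not part of the learning algorithm itself): the
# maximin mixed strategy of this zero-sum game can also be computed directly as
# a linear program, and the learned policies below should approach it. For the
# matrix above this gives p = (0.5, 0.5) with game value 2.5 for player 1
# (and q = (0.75, 0.25) for player 2). The helper name `solve_maximin` is ours,
# not part of the original script.
def solve_maximin(A):
    """Row player's maximin mixed strategy and game value via linear programming."""
    from scipy.optimize import linprog
    A = np.asarray(A, dtype=float)
    n_rows, n_cols = A.shape
    # Variables are [p_0, ..., p_{n-1}, v]; maximise v  <=>  minimise -v.
    c = np.zeros(n_rows + 1)
    c[-1] = -1.0
    # For every opponent column j require  p^T A[:, j] >= v,  i.e.  -A[:, j]^T p + v <= 0.
    A_ub = np.hstack([-A.T, np.ones((n_cols, 1))])
    b_ub = np.zeros(n_cols)
    # The policy p must be a probability distribution.
    A_eq = np.append(np.ones(n_rows), 0.0).reshape(1, -1)
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=[1.0],
                  bounds=[(0, None)] * n_rows + [(None, None)])
    return res.x[:n_rows], res.x[-1]

# Example: solve_maximin(payoffs) returns approximately (array([0.5, 0.5]), 2.5).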
class QAgent():
    """Minimax-Q agent for a 2x2 zero-sum matrix game."""

    def __init__(self, explor, learning_rate, gamma, player_id):
        self.explor = explor               # probability of exploring (random action)
        self.learning_rate = learning_rate
        self.P = np.array([0.5, 0.5])      # current mixed policy
        self.Q = np.ones((2, 2))           # Q[player 1's action][player 2's action]
        self.V = 1.0                       # current estimate of the game's value
        self.gamma = gamma
        # Player 2 is the minimiser: its payoffs are the negation of player 1's.
        self.payoffs = -np.array(payoffs) if player_id == 2 else np.array(payoffs)
        self.player_id = player_id

    def take_action(self):
        # With probability `explor` play a uniformly random action,
        # otherwise sample an action from the mixed policy P.
        if random.random() < self.explor:
            return random.choice([0, 1])
        return random.choices([0, 1], weights=(self.P[0], self.P[1]))[0]

    def observe(self, action, opponent):
        # Payoff tables are indexed as [player 1's action][player 2's action].
        return self.payoffs[action][opponent] if self.player_id == 1 else self.payoffs[opponent][action]

    def update_Q(self, action, opponent, reward):
        # Index Q as [player 1's action][player 2's action] for both agents.
        if self.player_id == 2:
            action, opponent = opponent, action
        self.Q[action][opponent] = (1 - self.learning_rate) * self.Q[action][opponent] + \
            self.learning_rate * (reward + self.gamma * self.V)

    def update_P(self, opponent):
        # Maximin step: pick the mixed policy that maximises the worst-case
        # expected Q-value over the opponent's actions.
        bnds = ((0., 1.), (0., 1.))
        cons = ({'type': 'eq', 'fun': lambda x: 1.0 - np.sum(x)})
        if self.player_id == 1:
            f = lambda x: np.min(np.matmul(x.T, self.Q))
        else:
            f = lambda x: np.min(np.matmul(x.T, self.Q.T))
        self.P = minimize(fun=lambda x: -f(x), x0=np.array([0., 0.]),
                          constraints=cons, bounds=bnds).x

    def update_V(self):
        # The game value is the worst-case expected Q-value of the current maximin policy.
        if self.player_id == 1:
            f = lambda x: np.min(np.matmul(x.T, self.Q))
        else:
            f = lambda x: np.min(np.matmul(x.T, self.Q.T))
        self.V = f(self.P)
def final_expected_payoff(agent1, agent2):
    """Expected per-round payoff of each agent under their current mixed policies."""
    expected_util_action_11 = agent1.payoffs[0][0] * agent2.P[0] + agent1.payoffs[0][1] * agent2.P[1]
    expected_util_action_12 = agent1.payoffs[1][0] * agent2.P[0] + agent1.payoffs[1][1] * agent2.P[1]
    expected_util_action_21 = agent2.payoffs[0][0] * agent1.P[0] + agent2.payoffs[1][0] * agent1.P[1]
    expected_util_action_22 = agent2.payoffs[0][1] * agent1.P[0] + agent2.payoffs[1][1] * agent1.P[1]
    return (expected_util_action_11 * agent1.P[0] + expected_util_action_12 * agent1.P[1],
            expected_util_action_21 * agent2.P[0] + expected_util_action_22 * agent2.P[1])
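# Hedged sanity check (expected behaviour derived from the analytical solution
# above, not a recorded result): at equilibrium agent 1 plays (0.5, 0.5) and
# agent 2 plays (0.75, 0.25), so final_expected_payoff evaluated at those
# policies should return (2.5, -2.5). The throwaway agents below exist only to
# reuse the payoff tables and are not part of the training run.
_eq1 = QAgent(explor=0.0, learning_rate=1.0, gamma=0.9, player_id=1)
_eq2 = QAgent(explor=0.0, learning_rate=1.0, gamma=0.9, player_id=2)
_eq1.P, _eq2.P = np.array([0.5, 0.5]), np.array([0.75, 0.25])
print("Equilibrium check:", final_expected_payoff(_eq1, _eq2))  # -> (2.5, -2.5)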
# Self-play training loop.
curr_episode = 0
total_num_of_episodes = 1000
agent1 = QAgent(explor=0.3, learning_rate=1.0, gamma=0.9, player_id=1)
agent2 = QAgent(explor=0.3, learning_rate=1.0, gamma=0.9, player_id=2)
policies = [[agent1.P, agent2.P]]
while curr_episode < total_num_of_episodes:
    action1 = agent1.take_action()
    action2 = agent2.take_action()
    rew1 = agent1.observe(action=action1, opponent=action2)
    rew2 = agent2.observe(action=action2, opponent=action1)
    agent1.update_Q(action=action1, opponent=action2, reward=rew1)
    agent1.update_P(opponent=action2)
    agent1.update_V()
    agent2.update_Q(action=action2, opponent=action1, reward=rew2)
    agent2.update_P(opponent=action1)
    agent2.update_V()
    policies.append([agent1.P, agent2.P])
    # Decay the learning rate every 100 episodes.
    if curr_episode % 100 == 0:
        agent1.learning_rate *= 0.8
        agent2.learning_rate *= 0.8
    curr_episode += 1
print("Agent's 1 Policy:")
print(agent1.P)
print("Agent's 2 Policy:")
print(agent2.P)
print("Agents' Expected Payoff:")
print(final_expected_payoff(agent1,agent2))
print("V:")
print(agent1.V*0.1, agent2.V*0.1)
print(agent2.V*0.1 + agent1.V*0.1)
# Plot how both agents' policies evolve over the training episodes.
policies = policies[:total_num_of_episodes]
policy_iter(policies, total_num_of_episodes)