-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path1._Classical_QL.py
110 lines (88 loc) · 3.36 KB
/
1._Classical_QL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import gym
from gym.envs.registration import register
import torch
from tqdm import tqdm
import numpy as np
from src.visualizations import *
import os
# --- Hyperparameters ---------------------------------------------------------
# Q-learning update
gamma = 0.05           # discount applied to the best next-state Q-value
learning_rate = 0.001  # step size of the Q-table update

# Training budget
epochs = 200     # upper bound on the number of training episodes
max_steps = 400  # upper bound on the number of steps within one episode

# Exploration schedule
random_chance = 0.99   # initial probability of picking a random action
random_scaling = 0.95  # factor the probability is multiplied by every episode

# Early stopping / reporting
window = 40              # number of most recent episodes in the win ratio
target_win_ratio = 0.98  # win ratio over `window` that stops training early
# Only forwarded to the plotting helpers; presumably the length of the
# shortest winning path -- TODO confirm against src.visualizations.
min_steps_num = 6

# --- Reproducibility ---------------------------------------------------------
global_seed = 42
torch.manual_seed(global_seed)
np.random.seed(global_seed)
# The environment is obtained through gym.make with slipping disabled.
# The explicit registration below is disabled and kept for reference only:
# register(
#     id='FrozenLake-v1',
#     entry_point='gym.envs.toy_text:FrozenLakeEnv',
#     kwargs={'map_name' : '4x4',
#             'is_slippery': False})
env = gym.make('FrozenLake-v1', is_slippery=False)
# print(env.render(mode='ansi'))

# Sizes of the discrete observation and action spaces.
number_of_actions = env.action_space.n
number_of_states = env.observation_space.n
print('number_of_states:', number_of_states, '\nnumber_of_actions', number_of_actions)

# Initialize Q: a table with one row per state and one column per action,
# filled with zeros.
Q = torch.zeros((number_of_states, number_of_actions))

# Per-episode training statistics; each list receives one entry when an
# episode finishes.
steps_total, rewards_total, win_history, random_params = [], [], [], []

# Current random-action probability. It is decayed during training, while
# `random_chance` keeps the initial value.
epoch_random_chance = random_chance
# Tabular Q-learning: run up to `epochs` episodes, stopping early once the
# win ratio over the last `window` episodes reaches `target_win_ratio`.
for episode in tqdm(range(epochs)):
    state = env.reset()
    reward_all = 0
    # The random-action probability shrinks before every episode.
    epoch_random_chance *= random_scaling

    for step in range(max_steps):
        # Action selection: with probability `epoch_random_chance` the action
        # is the argmax of random scores, otherwise of the current Q-values.
        explore = torch.rand(1) < epoch_random_chance
        action_scores = torch.rand(number_of_actions) if explore else Q[state]
        action = torch.argmax(action_scores)

        # Take the selected action.
        new_state, reward, done, info = env.step(action.item())

        # An episode that ends without any reward is punished with -1.
        # (A -0.01 penalty for the non-terminal case is left disabled.)
        if reward == 0 and done:
            reward = -1

        # Update Q for the visited (state, action) pair, then move on.
        current_value = Q[state, action]
        td_target = reward + gamma * torch.max(Q[new_state])
        Q[state, action] = current_value + learning_rate * (td_target - current_value)
        state = new_state
        reward_all += reward
        # env.render()

        episode_finished = done or step == max_steps - 1
        if not episode_finished:
            continue

        # Record the statistics of the finished episode.
        steps_total.append(step + 1)
        rewards_total.append(reward_all)
        win_history.append(1 if reward == 1. else 0)
        random_params.append(epoch_random_chance)
        break

    # Early stopping on the recent win ratio. The sum is always divided by
    # `window`, even while fewer than `window` episodes have been played.
    if sum(win_history[-window:]) / window >= target_win_ratio:
        break
# Directory (relative to the current working directory) that receives every
# artifact of this run.
results_path = "../results/classical_QL"
# Create the directory up front: saving into a missing directory fails, which
# is what happens on a fresh checkout without a results folder.
os.makedirs(results_path, exist_ok=True)

# Greedy strategy: index of the highest-valued action for every state,
# arranged on the 4x4 grid of the map.
strategy = np.array([torch.argmax(state_values).item() for state_values in Q]).reshape((4, 4))
# Flattened grid positions of the holes on the 4x4 map.
holes_indexes = np.array([5, 7, 11, 12])

strategy_save_path = os.path.join(results_path, "trained_strategy.jpg")
plot_strategy(strategy, holes_indexes, strategy_save_path)

# Training history, passed to the rolling-window plot helper with `window`.
moving_average_history_save_path = os.path.join(results_path, "training_history_moving_average.jpg")
plot_rolling_window_history(
    steps_total,
    rewards_total,
    win_history,
    random_params,
    target_win_ratio,
    min_steps_num,
    moving_average_history_save_path,
    window=window,
)

# The same history passed to the plain history plot helper.
history_save_path = os.path.join(results_path, "training_history.jpg")
plot_history(
    steps_total,
    rewards_total,
    win_history,
    random_params,
    target_win_ratio,
    min_steps_num,
    history_save_path,
)
# Store the hyperparameters of this run next to the plots, one
# `name;value` pair per line, in the order listed here.
hyperparameters = (
    ('gamma', gamma),
    ('epochs', epochs),
    ('max_steps', max_steps),
    ('learning_rate', learning_rate),
    ('random_chance', random_chance),
    ('random_scaling', random_scaling),
    ('window', window),
    ('target_win_ratio', target_win_ratio),
    ('min_steps_num', min_steps_num),
)
with open(os.path.join(results_path, "hyperparameters.txt"), "w+") as f:
    for name, value in hyperparameters:
        f.write(f'{name};{value}\n')