# reinforcement_learning_attack.py
import logging
import os
import sys
import time
import chainer
import chainerrl
import gym
import numpy as np
# noinspection PyUnresolvedReferences
import gym_env
# *** Settings ***
challenge_bit_length = 16
arbiter_seed = 1337
M_delay_granularity = 2
evaluation_interval = 10 ** 4
outdir = 'result_x' + str(challenge_bit_length) + '_M' + str(M_delay_granularity)
if not os.path.exists(outdir):
    os.makedirs(outdir)
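# Build the custom arbiter-PUF environment (registered by the gym_env package)
# with the settings above.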
env = gym.make('gym_env/ArbiterPufDelayII-v0', challenge_bit_length=challenge_bit_length, arbiter_seed=arbiter_seed,
               M_delay_granularity=M_delay_granularity)
print('observation space:', env.observation_space)
print('action space:', env.action_space)
obs = env.reset()
print('initial observation:', obs)
action = env.action_space.sample()
print("Action test:", action)
obs, r, done, info = env.step(action)
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)
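# The single random step above is just a sanity check of the environment's
# observation/reward interface; the actual training happens further below.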
print(f'{env.observation_space=}')
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
print('observation size:', obs_size)
print('num of actions:', n_actions)
# Set up the logger to print info messages for understandability.
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
q_func = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(obs_size, n_actions, n_hidden_layers=2,
                                                                  n_hidden_channels=64)
# Uncomment to use CUDA
# q_func.to_gpu(0)
# Use Adam to optimize q_func. eps=1e-3 is for numerical stability.
optimizer = chainer.optimizers.Adam(eps=1e-3)
optimizer.setup(q_func)
# Set the discount factor that discounts future rewards.
gamma = 0.95
# Use epsilon-greedy for exploration (linearly decaying epsilon).
# Alternative: a constant exploration rate.
# explorer = chainerrl.explorers.ConstantEpsilonGreedy(epsilon=0.3, random_action_func=env.action_space.sample)
explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(start_epsilon=1.0, end_epsilon=0.1, decay_steps=10 ** 2,
                                                        random_action_func=env.action_space.sample)
# Draw the computational graph and save it in the output directory.
chainerrl.misc.draw_computational_graph([q_func(np.zeros_like(obs, dtype=np.float32)[None])],
                                        os.path.join(outdir, 'model-' + time.strftime("%Y%m%d-%H%M%S")))
# DQN uses Experience Replay.
# Specify a replay buffer and its capacity.
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=5 * 10 ** 3)
# Observations from the environment may be numpy.float64, while Chainer
# only accepts numpy.float32 by default, so specify a converter as a
# feature extractor function phi.
phi = lambda x: x.astype(np.float32, copy=False)
# Now create an agent that will interact with the environment.
agent = chainerrl.agents.DoubleDQN(q_func, optimizer, replay_buffer, gamma, explorer, replay_start_size=1000,
                                   update_interval=1, target_update_interval=500, phi=phi)  # add gpu=0 to use a GPU
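# Double DQN (used above) decouples action selection from target-value evaluation,
# which reduces the Q-value overestimation of vanilla DQN.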
chainerrl.experiments.train_agent_with_evaluation(agent, env,
                                                  steps=100000000,  # Train for up to 10^8 steps
                                                  eval_n_steps=None,  # Evaluate for a number of episodes, not steps
                                                  eval_n_episodes=10000,  # 10000 challenges (episodes) per evaluation
                                                  train_max_episode_len=200,  # Maximum length of each training episode
                                                  eval_interval=evaluation_interval,  # Evaluate after every evaluation_interval steps
                                                  successful_score=0.99,  # Stop early once the mean evaluation score reaches 0.99
                                                  outdir=outdir)  # Save agents and logs to the output directory
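# --- Optional post-training rollout: a minimal sketch added for illustration,
# not part of the original training run. It assumes the environment terminates
# episodes on its own within 200 steps (mirroring train_max_episode_len above),
# and the 'final_agent' subdirectory name is just a hypothetical choice.
agent.save(os.path.join(outdir, 'final_agent'))  # snapshot the trained weights
for episode in range(5):
    obs = env.reset()
    total_reward = 0.0
    for t in range(200):
        action = agent.act(obs)  # greedy action, no exploration or training update
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    agent.stop_episode()  # clear the agent's per-episode state
    print('rollout', episode, 'return:', total_reward)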