Add road regeneration and pretraining #9

Open · wants to merge 21 commits into master
19 changes: 12 additions & 7 deletions algos/custom_ppo2.py
@@ -17,12 +17,17 @@ class PPO2WithVAE(PPO2):
Notable changes:
- optimization is done after each episode and not after n steps
"""
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2"):
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1,
tb_log_name="PPO2", reset_num_timesteps=True):
# Transform to callable if needed
self.learning_rate = get_schedule_fn(self.learning_rate)
self.cliprange = get_schedule_fn(self.cliprange)
cliprange_vf = get_schedule_fn(self.cliprange_vf)

with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
new_tb_log = self._init_num_timesteps(reset_num_timesteps)


with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer:
self._setup_learn(seed)

runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
@@ -39,6 +44,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo
frac = 1.0 - timestep / total_timesteps
lr_now = self.learning_rate(frac)
cliprangenow = self.cliprange(frac)
cliprange_vf_now = cliprange_vf(frac)
# true_reward is the reward without discount
obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
n_timesteps += len(obs)
@@ -49,13 +55,11 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo
for epoch_num in range(self.noptepochs):
np.random.shuffle(inds)
for start in range(0, self.n_batch, batch_size):
# timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) //
# batch_size)
end = start + batch_size
mbinds = inds[start:end]
slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer,
update=n_timesteps))
update=n_timesteps, cliprange_vf=cliprange_vf_now))
else: # recurrent version
assert self.n_envs % self.nminibatches == 0
env_indices = np.arange(self.n_envs)
@@ -89,8 +93,9 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo
logger.logkv("total_timesteps", n_timesteps)
logger.logkv("fps", fps)
logger.logkv("explained_variance", float(explained_var))
logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
logger.logkv('time_elapsed', t_start - t_first_start)
for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
logger.logkv(loss_name, loss_val)
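
Taken together, these changes let PPO2WithVAE.learn() accept reset_num_timesteps (so successive learn() calls can continue a single tensorboard run) and schedule cliprange_vf alongside the learning rate and cliprange. A minimal usage sketch, not part of the diff; the environment, policy name, and hyperparameters below are illustrative, the repository itself trains on the DonkeyVae env:

import gym
from stable_baselines.common.vec_env import DummyVecEnv

from algos.custom_ppo2 import PPO2WithVAE

# Illustrative continuous-control env, just to show the call pattern
env = DummyVecEnv([lambda: gym.make('MountainCarContinuous-v0')])
model = PPO2WithVAE('MlpPolicy', env, n_steps=256, verbose=1)
# First call starts a fresh timestep counter and tensorboard run
model.learn(total_timesteps=10000, reset_num_timesteps=True)
# A second call with reset_num_timesteps=False continues the same run
model.learn(total_timesteps=10000, reset_num_timesteps=False)
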
34 changes: 24 additions & 10 deletions algos/custom_sac.py
@@ -23,7 +23,7 @@ class SACWithVAE(SAC):
def optimize(self, step, writer, current_lr):
"""
Do several optimization steps to update the different networks.

:param step: (int) current timestep
:param writer: (TensorboardWriter object)
:param current_lr: (float) Current learning rate
@@ -46,17 +46,23 @@ def optimize(self, step, writer, current_lr):
return mb_infos_vals

def learn(self, total_timesteps, callback=None, seed=None,
log_interval=1, tb_log_name="SAC", print_freq=100):
log_interval=1, tb_log_name="SAC", print_freq=100, reset_num_timesteps=True):

new_tb_log = self._init_num_timesteps(reset_num_timesteps)

with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer:

self._setup_learn(seed)

# Transform to callable if needed
self.learning_rate = get_schedule_fn(self.learning_rate)
# Initial learning rate
current_lr = self.learning_rate(1)

start_time = time.time()
episode_rewards = [0.0]
if self.action_noise is not None:
self.action_noise.reset()
is_teleop_env = hasattr(self.env, "wait_for_teleop_reset")
# TeleopEnv
if is_teleop_env:
@@ -85,13 +91,18 @@ def learn(self, total_timesteps, callback=None, seed=None,

# Before training starts, randomly sample actions
# from a uniform distribution for better exploration.
# Afterwards, use the learned policy.
if step < self.learning_starts:
action = self.env.action_space.sample()
# Afterwards, use the learned policy
# if random_exploration is set to 0 (normal setting)
if (step < self.learning_starts
or np.random.rand() < self.random_exploration):
# No need to rescale when sampling random action
rescaled_action = action
rescaled_action = action = self.env.action_space.sample()
else:
action = self.policy_tf.step(obs[None], deterministic=False).flatten()
# Add noise to the action (improve exploration,
# not needed in general)
if self.action_noise is not None:
action = np.clip(action + self.action_noise(), -1, 1)
# Rescale from [-1, 1] to the correct bounds
rescaled_action = action * np.abs(self.action_space.low)

@@ -127,6 +138,8 @@ def learn(self, total_timesteps, callback=None, seed=None,

episode_rewards[-1] += reward
if done:
if self.action_noise is not None:
self.action_noise.reset()
if not (isinstance(self.env, VecEnv) or is_teleop_env):
obs = self.env.reset()

@@ -149,13 +162,14 @@ def learn(self, total_timesteps, callback=None, seed=None,
else:
mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

num_episodes = len(episode_rewards)
num_episodes = len(episode_rewards) - 1
if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
fps = int(step / (time.time() - start_time))
logger.logkv("episodes", num_episodes)
logger.logkv("mean 100 episode reward", mean_reward)
logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
logger.logkv("n_updates", self.n_updates)
logger.logkv("current_lr", current_lr)
logger.logkv("fps", fps)
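
Pulled out of the training loop above, the new action-selection logic looks roughly like the following. This is a sketch, not the repository's exact code path: policy_step stands in for self.policy_tf.step, and the rescaling assumes symmetric action bounds, as in the diff.

import numpy as np

def select_action(step, env, policy_step, learning_starts,
                  random_exploration, action_noise=None):
    if step < learning_starts or np.random.rand() < random_exploration:
        # Random action: already within the env bounds, no rescaling needed
        rescaled_action = action = env.action_space.sample()
    else:
        action = policy_step()  # policy output scaled to [-1, 1]
        # Optional exploration noise, reset at the end of each episode
        if action_noise is not None:
            action = np.clip(action + action_noise(), -1, 1)
        # Rescale from [-1, 1] to the actual bounds (assumes low == -high)
        rescaled_action = action * np.abs(env.action_space.low)
    return action, rescaled_action
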
6 changes: 1 addition & 5 deletions config.py
@@ -36,17 +36,13 @@
# Number of past commands to concatenate with the input
N_COMMAND_HISTORY = 20
# Max cross track error (used in normal mode to reset the car)
MAX_CTE_ERROR = 2.0
# Level to use for training
LEVEL = 0
MAX_CTE_ERROR = 3.0

# Action repeat
FRAME_SKIP = 1
Z_SIZE = 512 # Only used for random features
TEST_FRAME_SKIP = 1

BASE_ENV = "DonkeyVae-v0"
ENV_ID = "DonkeyVae-v0-level-{}".format(LEVEL)
# Params that are logged
SIM_PARAMS = ['MIN_THROTTLE', 'MAX_THROTTLE', 'FRAME_SKIP',
'MAX_CTE_ERROR', 'N_COMMAND_HISTORY', 'MAX_STEERING_DIFF']
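
SIM_PARAMS only lists the names of the constants to log. A small sketch of how they could be collected, assuming they are all module-level constants in config.py (this helper is not part of the diff):

import config

sim_params = {name: getattr(config, name) for name in config.SIM_PARAMS}
print(sim_params)  # e.g. includes 'MAX_CTE_ERROR': 3.0 after this change
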
37 changes: 6 additions & 31 deletions donkey_gym/__init__.py
@@ -1,31 +1,6 @@
from gym.envs.registration import register

register(
id='DonkeyVae-v0',
entry_point='donkey_gym.vae_env.vae_env:DonkeyVAEEnv',
timestep_limit=None,
)
#
# register(
# id='donkey-generated-roads-v0',
# entry_point='donkey_gym.vae_env:GeneratedRoadsEnv',
# timestep_limit=2000,
# )
#
# register(
# id='donkey-warehouse-v0',
# entry_point='donkey_gym.vae_env:WarehouseEnv',
# timestep_limit=2000,
# )
#
# register(
# id='donkey-avc-sparkfun-v0',
# entry_point='donkey_gym.vae_env:AvcSparkfunEnv',
# timestep_limit=2000,
# )
#
# register(
# id='donkey-generated-track-v0',
# entry_point='donkey_gym.vae_env:GeneratedTrackEnv',
# timestep_limit=2000,
# )
from gym.envs.registration import register

register(
id='DonkeyVae-v0',
entry_point='donkey_gym.envs.vae_env:DonkeyVAEEnv',
)
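
Importing the package is what triggers this register() call; constructing the environment itself may need extra kwargs (for instance a trained VAE), which are not shown here. A quick check that the id is registered:

import gym
import donkey_gym  # noqa: F401 -- runs the register() call above

print('DonkeyVae-v0' in [spec.id for spec in gym.envs.registry.all()])
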
46 changes: 42 additions & 4 deletions donkey_gym/envs/donkey_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,24 @@ class DonkeyUnitySimContoller:
:param level: (int) Level index
:param port: (int) Port to use for communicating with the simulator
:param max_cte_error: (float) Max cross track error before reset
:param road_style: (int) Road style index (level 0 currently has 5 styles)
:param seed: (int) Random seed used to make road generation deterministic
:param turn_increment: (float) Controls the curviness of the generated road
"""

def __init__(self, level, port=9090, max_cte_error=3.0):
def __init__(self, level, port=9090, max_cte_error=3.0,
road_style=0, seed=0, turn_increment=1.0):

self.level = level
self.verbose = False

# sensor size - height, width, depth
self.camera_img_size = INPUT_DIM

self.address = ('0.0.0.0', port)

# Socket message handler
self.handler = DonkeyUnitySimHandler(level, max_cte_error=max_cte_error)
self.handler = DonkeyUnitySimHandler(level, max_cte_error=max_cte_error,
road_style=road_style, seed=seed, turn_increment=turn_increment)
# Create the server to which the unity sim will connect
self.server = SimServer(self.address, self.handler)
# Start the Asynchronous socket handler thread
Expand All @@ -54,10 +59,17 @@ def wait_until_loaded(self):
print("Waiting for sim to start..."
"if the simulation is running, press EXIT to go back to the menu")
time.sleep(3.0)
self.regen_road()

def reset(self):
self.handler.reset()

def regen_road(self):
self.handler.send_regen_road()

def seed(self, seed):
self.handler.seed = seed

def get_sensor_size(self):
"""
:return: (int, int, int)
Expand Down Expand Up @@ -92,15 +104,23 @@ class DonkeyUnitySimHandler(IMesgHandler):

:param level: (int) Level ID
:param max_cte_error: (float) Max cross track error before reset
:param road_style: (int) Road style index (level 0 currently has 5 styles)
:param seed: (int) Random seed used to make road generation deterministic
:param turn_increment: (float) Controls the curviness of the generated road
"""

def __init__(self, level, max_cte_error=3.0):
def __init__(self, level, max_cte_error=3.0,
road_style=0, seed=0, turn_increment=1.0):
self.level_idx = level
self.sock = None
self.loaded = False
self.verbose = False
self.timer = FPSTimer(verbose=0)
self.max_cte_error = max_cte_error
# Road characteristic
self.road_style = road_style
self.seed = seed
self.turn_increment = turn_increment

# sensor size - height, width, depth
self.camera_img_size = INPUT_DIM
Expand Down Expand Up @@ -303,6 +323,24 @@ def on_recv_scene_names(self, data):
print("SceneNames:", names)
self.send_load_scene(names[self.level_idx])

def send_regen_road(self):
"""
Regenerate the road, where available. For now this only works in level 0,
which currently has 5 road styles; the style changes the road texture
and the road width.
The rand_seed can be used to make road generation deterministic.
The turn_increment defaults to 1.0 internally. Provide a non-zero positive float
to change the curviness of the road: smaller values give shallower curves.
"""
msg = {
'msg_type': 'regen_road',
'road_style': str(int(self.road_style)),
'rand_seed': str(int(self.seed)),
'turn_increment': str(self.turn_increment)
}
print("Regen road, road_style={}, seed={}".format(self.road_style, self.seed))
self.queue_message(msg)

def send_control(self, steer, throttle):
"""
Send message to the server for controlling the car.
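
A minimal usage sketch for the road-regeneration parameters (the import path is inferred from the file location above, and it assumes a Donkey simulator instance connects to the given port):

from donkey_gym.envs.donkey_sim import DonkeyUnitySimContoller

controller = DonkeyUnitySimContoller(level=0, port=9090, max_cte_error=3.0,
                                     road_style=1, seed=42, turn_increment=2.0)
controller.wait_until_loaded()  # blocks until the sim connects, then calls regen_road()
controller.regen_road()         # regenerate again with the same settings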