For CartPole in Chapter 4, this is the code that worked for me. #71

Open
jupitermarketingagency opened this issue Dec 7, 2023 · 3 comments

Comments

@jupitermarketingagency

jupitermarketingagency commented Dec 7, 2023

#!/usr/bin/env python3
import gymnasium as gym
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

HIDDEN_SIZE = 128
BATCH_SIZE = 16
PERCENTILE = 70

class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)

Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

def iterate_batches(env, net, batch_size):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs, _ = env.reset()
    env.render()
    sm = nn.Softmax(dim=1)
    while True:
        obs_v = torch.FloatTensor([obs])
        act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.data.numpy()[0]
        # Sample an action from the policy's softmax distribution
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _, _ = env.step(action)
        episode_reward += reward
        step = EpisodeStep(observation=obs, action=action)
        episode_steps.append(step)

        if is_done:
            e = Episode(reward=episode_reward, steps=episode_steps)
            batch.append(e)
            episode_reward = 0.0
            episode_steps = []
            next_obs, _ = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs

def filter_batch(batch, percentile):
    # Elite selection: keep only episodes whose total reward is at or
    # above the given percentile of the batch
    rewards = list(map(lambda s: s.reward, batch))
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs = []
    train_act = []
    for reward, steps in batch:
        if reward < reward_bound:
            continue
        train_obs.extend(map(lambda step: step.observation, steps))
        train_act.extend(map(lambda step: step.action, steps))

    train_obs_v = torch.FloatTensor(train_obs)
    train_act_v = torch.LongTensor(train_act)
    return train_obs_v, train_act_v, reward_bound, reward_mean

if name == "main":
env = gym.make("CartPole-v1", render_mode='human')
#env = gym.wrappers.Monitor(env, directory="mon", force=True)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(obs_size, HIDDEN_SIZE, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.01)
writer = SummaryWriter(comment="-cartpole")

for iter_no, batch in enumerate(iterate_batches(
        env, net, BATCH_SIZE)):
    obs_v, acts_v, reward_b, reward_m = \
        filter_batch(batch, PERCENTILE)
    optimizer.zero_grad()
    action_scores_v = net(obs_v)
    loss_v = objective(action_scores_v, acts_v)
    loss_v.backward()
    optimizer.step()
    print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % (
        iter_no, loss_v.item(), reward_m, reward_b))
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_bound", reward_b, iter_no)
    writer.add_scalar("reward_mean", reward_m, iter_no)
    if reward_m > 199:
        print("Solved!")
        break
writer.close()
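
For anyone comparing this with the book's original Chapter 4 listing, the substantive changes are the switch from gym to gymnasium and its updated reset/step return values; the cross-entropy training loop itself is unchanged. A minimal sketch of the newer API (assuming gymnasium >= 0.26), which is why the code above unpacks obs, _ = env.reset() and a 5-tuple from env.step():

import gymnasium as gym

env = gym.make("CartPole-v1")
obs, info = env.reset()          # reset() now returns (observation, info)
action = env.action_space.sample()
obs, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated   # the old single done flag is split into two
env.close()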
@dkinneyBU

@jupitermarketingagency OH MY GOOOOOOODDDD! Thank you for this, I've been fighting this stupid program for THREE DAYS! This guy really needs to revisit this code; I've had to debug basically all of it, with a few rare exceptions. And this is only Chapter 4!!!!

You are a life saver. If you conjure up any more fixes, please post--I will be eternally grateful. :-)

@jupitermarketingagency
Author

@dkinneyBU Glad to hear that was of help to you. Yes, I agree with you about him revisiting this code. We've seen this happen over and over across RL courses and books, because the books are more than two years old. So we've been trying to focus only on recently published ones.

@MFKruger

Thank you very much! Being a newbie to PyTorch and DRL, you saved me a lot of time!
