I recently started learning the DDPG algorithm in reinforcement learning, and I tried to solve the MountainCarContinuous-v0 problem. I used the tuned parameters from RL Zoo wherever possible, but it doesn't work. Can anyone help me find where the problem is and get it training on my code base?
Result:
[policy loss plot]
[critic loss plot]
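For comparison, a minimal stable-baselines3 run of DDPG on the same environment could look like the sketch below (the hyperparameters in it are only rough defaults picked for illustration, not the exact RL Zoo values):

import numpy as np
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise

# OU exploration noise for the 1-D action space, similar to what my code uses
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(1), sigma=0.5 * np.ones(1))

model = DDPG("MlpPolicy", "MountainCarContinuous-v0", action_noise=action_noise, verbose=1)
model.learn(total_timesteps=100000)

My hand-written implementation below is meant to follow the same recipe. Here is my code: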
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from tensorboardX import SummaryWriter
from copy import deepcopy

logdir = "logs/scalars/"
file_writer = SummaryWriter(log_dir=logdir)
class Agent(nn.Module):
    """Deterministic actor: maps a 2-D observation to a single action in [-1, 1]."""
    def __init__(self):
        super().__init__()
        self.l1 = nn.Sequential(
            nn.Linear(2, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
            nn.Tanh()
        )

    def forward(self, obs):
        return self.l1(obs)
class Qfunction(nn.Module):
    """Critic: maps an (observation, action) pair to a scalar Q-value."""
    def __init__(self):
        super().__init__()
        self.l1 = nn.Sequential(
            nn.Linear(3, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )

    def forward(self, obs, action):
        x = torch.cat((obs, action), -1)
        return self.l1(x)
class DDPG():
    def __init__(self):
        self.total_num_step = 100000
        self.device = 'cuda'
        self.policy = Agent().to(self.device)
        self.policy_target = deepcopy(self.policy).to(self.device)
        self.tau = 0.005
        self.Q = Qfunction().to(self.device)
        self.Q_target = deepcopy(self.Q).to(self.device)
        self.Policyoptimizer = Adam(self.policy.parameters(), lr=1e-3)
        self.Qoptimizer = Adam(self.Q.parameters(), lr=1e-3)
        self.ounoise = OrnsteinUhlenbeckActionNoise(mean=np.array([0]), sigma=np.array(0.5))
        self.buffer = ReplayBuffer(2, 1, 1000000)
        self.gamma = 0.99
        self.env = gym.make("MountainCarContinuous-v0")
        self.test_env = gym.make("MountainCarContinuous-v0")
    def run(self):
        state, info = self.env.reset()
        self.ounoise.reset()
        state = torch.from_numpy(state).to(self.device)
        n_step = 0
        # begin
        for i in range(self.total_num_step):
            # collect data
            with torch.no_grad():
                if n_step < 100:
                    action = self.env.action_space.sample()
                else:
                    action = self.policy(state).cpu().numpy()
                noise = self.ounoise()
                action = action + noise
                action = action.clip(-1, 1)
            next_state, reward, terminated, truncated, info = self.env.step(action)
            episode_over = 1 if (terminated or truncated) else 0
            self.buffer.store(state.cpu().numpy(), action, reward, next_state, episode_over)
            if episode_over:
                state, info = self.env.reset()
                state = torch.from_numpy(state).to(self.device)
                self.ounoise.reset()
            else:
                state = torch.from_numpy(next_state).to(self.device)
            n_step += 1
            # update
            if n_step > 100:
                lossq, lossp = self.train_once()
                file_writer.add_scalar('lossq', lossq, n_step)
                file_writer.add_scalar('lossp', lossp, n_step)
            if n_step % 1000 == 0:
                print(n_step)
                print(self.test())
    def train_once(self):
        for i in range(1):
            self.policy.train()
            # sample train data
            state, action, reward, next_state, done = self.buffer.sample_batch(self.device, batch_size=256)
            # critic update: y = r + gamma * (1 - done) * Q_target(s', pi_target(s'))
            with torch.no_grad():
                target = reward + self.gamma * self.Q_target(next_state, self.policy_target(next_state)) * (1 - done)
            lossq = F.mse_loss(self.Q(state, action), target)
            self.Qoptimizer.zero_grad()
            lossq.backward()
            self.Qoptimizer.step()
            # actor update: maximize Q(s, pi(s))
            self.Policyoptimizer.zero_grad()
            loss_p = -self.Q(state, self.policy(state)).mean()
            loss_p.backward()
            self.Policyoptimizer.step()
            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(self.policy.parameters(), self.policy_target.parameters()):
                    # NB: we use the in-place operations "mul_" and "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(1 - self.tau)
                    p_targ.data.add_(self.tau * p.data)
                for p, p_targ in zip(self.Q.parameters(), self.Q_target.parameters()):
                    p_targ.data.mul_(1 - self.tau)
                    p_targ.data.add_(self.tau * p.data)
        return lossq.item(), loss_p.item()
    def test(self):
        self.policy.eval()
        state, info = self.test_env.reset()
        state = torch.from_numpy(state).to(self.device)
        episode_over = False
        rewards = 0
        with torch.no_grad():
            while not episode_over:
                action = self.policy(state)  # agent policy that uses the observation and info
                next_state, reward, terminated, truncated, info = self.test_env.step(action.cpu().numpy())
                episode_over = terminated or truncated
                next_state = torch.from_numpy(next_state).to(self.device)
                rewards += reward
                state = next_state
        return rewards
class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for DDPG agents.
    """

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros((size, 1), dtype=np.float32)
        self.done_buf = np.zeros((size, 1), dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
    def sample_batch(self, device, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        # The order here must match the unpacking in train_once:
        # state, action, reward, next_state, done.
        batch = (self.obs_buf[idxs],
                 self.act_buf[idxs],
                 self.rew_buf[idxs],
                 self.obs2_buf[idxs],
                 self.done_buf[idxs])
        return tuple(torch.as_tensor(v, dtype=torch.float32).to(device) for v in batch)
if __name__ == "__main__":
    trainer = DDPG()
    trainer.run()