RL: Implementing Actor-Critic (AC)

Building the network

import torch
import torch.nn as nn
import torch.nn.functional as F

# Actor-Critic network: a shared trunk with separate actor and critic heads
class ActorCritic(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(ActorCritic, self).__init__()
        self.fc1 = nn.Linear(input_shape, 128)
        self.fc2 = nn.Linear(128, 128)
        self.actor = nn.Linear(128, n_actions)   # policy head
        self.critic = nn.Linear(128, 1)          # value head V(s)

    def forward(self, x):  # the first two layers are shared by both heads, which improves stability
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        actor_output = F.softmax(self.actor(x), dim=-1)  # action probabilities
        critic_output = self.critic(x)                   # state value
        return actor_output, critic_output
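A quick way to sanity-check the network is to push a dummy state through it and confirm the two heads return the expected shapes. The sizes below (a 4-dimensional state with 2 actions, roughly CartPole-sized) are example values, not taken from the original:

net = ActorCritic(input_shape=4, n_actions=2)  # example sizes, e.g. a CartPole-like task
dummy_state = torch.zeros(1, 4)                # batch of one state
probs, value = net(dummy_state)
print(probs.shape, value.shape)                # torch.Size([1, 2]) torch.Size([1, 1])
print(probs.sum().item())                      # ~1.0, since the actor head ends in a softmax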

Sampling loop

for i in range(1000):                  # run 1000 episodes
    state = env.reset()
    done = False
    total_reward = 0

    while not done:  # within an episode: note that an update happens after every single sampled transition
        action = ac.get_action(state)
        next_state, reward, done, info = env.step(action)
        ac.update(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
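The loop above calls ac.get_action(state), which is not shown in the source. A minimal sketch of what it could look like, assuming the agent samples from the categorical distribution produced by the actor head:

from torch.distributions import Categorical

def get_action(self, state):
    state = torch.FloatTensor(state).unsqueeze(0)
    probs, _ = self.actor_critic(state)   # actor head gives action probabilities
    dist = Categorical(probs)             # categorical policy over discrete actions
    action = dist.sample()                # sample instead of argmax to keep exploring
    return action.item()

Sampling from the distribution (rather than taking the argmax) is what keeps the policy exploring and matches the policy-gradient estimator used in the updates below.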

Training step: Actor-Critic

def update(self, state, action, reward, next_state, done):
    state = torch.FloatTensor(state).unsqueeze(0)
    next_state = torch.FloatTensor(next_state).unsqueeze(0)
    action = torch.LongTensor([action])
    reward = torch.FloatTensor([reward])
    done = torch.FloatTensor([int(done)])

    # Compute the Q estimate: one-step TD target r + gamma * V(s'), zero at episode end
    _, next_state_value = self.actor_critic(next_state)
    probs, state_value = self.actor_critic(state)
    q_value = reward + self.gamma * next_state_value * (1 - done)

    # Actor loss: policy gradient weighted by the Q estimate (detached, it only acts as a weight);
    # critic loss: fit V(s) to the TD target
    log_prob = torch.log(probs[0][action])
    actor_loss = -(log_prob * q_value.detach()).mean()
    critic_loss = F.mse_loss(state_value, q_value.detach())
    loss = actor_loss + critic_loss

    # Update the actor and critic parameters in one optimizer step
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
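The update assumes the agent object also carries self.actor_critic, self.gamma and self.optimizer, whose construction is not shown in the source. A minimal sketch of that constructor (the class name ACAgent and the hyperparameter values are assumptions):

import torch.optim as optim

class ACAgent:
    def __init__(self, input_shape, n_actions, lr=1e-3, gamma=0.99):  # assumed hyperparameters
        self.actor_critic = ActorCritic(input_shape, n_actions)
        self.gamma = gamma
        # A single optimizer over the shared network covers both heads, so
        # actor_loss + critic_loss can be backpropagated in one step.
        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=lr)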

Training step: A2C (Advantage Actor-Critic)

def update(self, state, action, reward, next_state, done):
    state = torch.FloatTensor(state).unsqueeze(0)
    next_state = torch.FloatTensor(next_state).unsqueeze(0)
    action = torch.LongTensor([action])
    reward = torch.FloatTensor([reward])
    done = torch.FloatTensor([int(done)])

    # Compute the advantage: A(s, a) = r + gamma * V(s') - V(s); the target V(s') is detached
    _, next_state_value = self.actor_critic(next_state)
    probs, state_value = self.actor_critic(state)
    advantage = reward + self.gamma * next_state_value.detach() * (1 - done) - state_value

    # Actor loss: policy gradient weighted by the advantage (detached so the critic term
    # does not leak into the policy gradient); critic loss: squared TD error
    log_prob = torch.log(probs[0][action])
    actor_loss = -(log_prob * advantage.detach()).mean()
    critic_loss = advantage.pow(2).mean()
    loss = actor_loss + critic_loss

    # Update the actor and critic parameters in one optimizer step
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
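The only change from the plain Actor-Critic update is the weight on the log-probability: A2C replaces the raw TD/Q estimate with the advantage, which subtracts the state value V(s) as a baseline. The baseline leaves the policy gradient unbiased while reducing its variance:

$$
A(s_t, a_t) = r_t + \gamma V(s_{t+1}) - V(s_t),
\qquad
\mathbb{E}_{a \sim \pi_\theta}\!\left[\nabla_\theta \log \pi_\theta(a \mid s)\, b(s)\right] = 0
\quad \text{for any baseline } b(s).
$$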