目录
- 一、离散动作
- 二、连续动作
- 1、例子1
- 2、知乎给出的示例
- 2、github里面的代码
免责声明:以下代码部分来自网络,部分来自ChatGPT,部分来自个人的理解。如有其他观点,欢迎讨论!
一、离散动作
注意:本文均以PPO算法为例。
# time: 2023/11/22 21:04
# author: YanJP
import torch
import torch
import torch.nn as nn
from torch.distributions import Categorical
class MultiDimensionalActor(nn.Module):
def __init__(self, input_dim, output_dims):
super(MultiDimensionalActor, self).__init__()
# Define a shared feature extraction network
self.feature_extractor = nn.Sequential(
nn.Linear(input_dim, 128),
nn.ReLU(),
nn.Linear(128, 64),
nn.ReLU()
)
# Define individual output layers for each action dimension
self.output_layers = nn.ModuleList([
nn.Linear(64, num_actions) for num_actions in output_dims
])
def forward(self, state):
# Feature extraction
features = self.feature_extractor(state)
# Generate Categorical objects for each action dimension
categorical_objects = [Categorical(logits=output_layer(features)) for output_layer in self.output_layers]
return categorical_objects
# 定义主函数
def main():
# 定义输入状态维度和每个动作维度的动作数
input_dim = 10
output_dims = [5, 8] # 两个动作维度,分别有 3 和 4 个可能的动作
# 创建 MultiDimensionalActor 实例
actor_network = MultiDimensionalActor(input_dim, output_dims)
# 生成输入状态(这里使用随机数据作为示例)
state = torch.randn(1, input_dim)
# 调用 actor 网络
categorical_objects = actor_network(state)
# 输出每个动作维度的采样动作和对应的对数概率
for i, categorical in enumerate(categorical_objects):
sampled_action = categorical.sample()
log_prob = categorical.log_prob(sampled_action)
print(f"Sampled action for dimension {i+1}: {sampled_action.item()}, Log probability: {log_prob.item()}")
if __name__ == "__main__":
main()
#Sampled action for dimension 1: 1, Log probability: -1.4930928945541382
#Sampled action for dimension 2: 3, Log probability: -2.1875085830688477
注意代码中categorical函数的两个不同传入参数的区别:参考链接
简单来说,logits是计算softmax的,probs直接就是已知概率的时候传进去就行。
二、连续动作
参考链接:github、知乎
为什么取对数概率?参考回答
1、例子1
先看如下的代码:
# time: 2023/11/21 21:33
# author: YanJP
#这是对应多维连续变量的例子:
# 参考链接:https://github.com/XinJingHao/PPO-Continuous-Pytorch/blob/main/utils.py
# https://www.zhihu.com/question/417161289
import torch.nn as nn
import torch
class Policy(nn.Module):
def __init__(self, in_dim, n_hidden_1, n_hidden_2, num_outputs):
super(Policy, self).__init__()
self.layer = nn.Sequential(
nn.Linear(in_dim, n_hidden_1),
nn.ReLU(True),
nn.Linear(n_hidden_1, n_hidden_2),
nn.ReLU(True),
nn.Linear(n_hidden_2, num_outputs)
)
class Normal(nn.Module):
def __init__(self, num_outputs):
super().__init__()
self.stds = nn.Parameter(torch.zeros(num_outputs)) #创建一个可学习的参数
def forward(self, x):
dist = torch.distributions.Normal(loc=x, scale=self.stds.exp())
action = dist.sample((every_dimention_output,)) #这里我觉得是最重要的,不填sample的参数的话,默认每个分布只采样一个值!!!!!!!!
return action
if __name__ == '__main__':
policy = Policy(4,20,20,5)
normal = Normal(5) #设置5个维度
every_dimention_output=10 #每个维度10个输出
observation = torch.Tensor(4)
action = normal.forward(policy.layer( observation))
print("action: ",action)
- self.stds.exp(),表示求指数,因为正态分布的标准差都是正数。
- action = dist.sample((every_dimention_output,))这里最重要!!!
2、知乎给出的示例
class Agent(nn.Module):
def __init__(self, envs):
super(Agent, self).__init__()
self.actor_mean = nn.Sequential(
layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
nn.Tanh(),
layer_init(nn.Linear(64, 64)),
nn.Tanh(),
layer_init(nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01),
)
self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))
def get_action_and_value(self, x, action=None):
action_mean = self.actor_mean(x)
action_logstd = self.actor_logstd.expand_as(action_mean)
action_std = torch.exp(action_logstd)
probs = Normal(action_mean, action_std)
if action is None:
action = probs.sample()
return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x)
这里的np.prod(envs.single_action_space.shape),表示每个维度的动作数相乘,然后初始化这么多个actor网络的标准差和均值,最后action里面的sample就是采样这么多个数据。(感觉还是拉成了一维计算)
2、github里面的代码
github
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Beta,Normal
class GaussianActor_musigma(nn.Module):
def __init__(self, state_dim, action_dim, net_width):
super(GaussianActor_musigma, self).__init__()
self.l1 = nn.Linear(state_dim, net_width)
self.l2 = nn.Linear(net_width, net_width)
self.mu_head = nn.Linear(net_width, action_dim)
self.sigma_head = nn.Linear(net_width, action_dim)
def forward(self, state):
a = torch.tanh(self.l1(state))
a = torch.tanh(self.l2(a))
mu = torch.sigmoid(self.mu_head(a))
sigma = F.softplus( self.sigma_head(a) )
return mu,sigma
def get_dist(self, state):
mu,sigma = self.forward(state)
dist = Normal(mu,sigma)
return dist
def deterministic_act(self, state):
mu, sigma = self.forward(state)
return mu
上述代码主要是通过设置mu_head 和sigma_head的个数,来实现多维动作。
class GaussianActor_mu(nn.Module):
def __init__(self, state_dim, action_dim, net_width, log_std=0):
super(GaussianActor_mu, self).__init__()
self.l1 = nn.Linear(state_dim, net_width)
self.l2 = nn.Linear(net_width, net_width)
self.mu_head = nn.Linear(net_width, action_dim)
self.mu_head.weight.data.mul_(0.1)
self.mu_head.bias.data.mul_(0.0)
self.action_log_std = nn.Parameter(torch.ones(1, action_dim) * log_std)
def forward(self, state):
a = torch.relu(self.l1(state))
a = torch.relu(self.l2(a))
mu = torch.sigmoid(self.mu_head(a))
return mu
def get_dist(self,state):
mu = self.forward(state)
action_log_std = self.action_log_std.expand_as(mu)
action_std = torch.exp(action_log_std)
dist = Normal(mu, action_std)
return dist
def deterministic_act(self, state):
return self.forward(state)
class Critic(nn.Module):
def __init__(self, state_dim,net_width):
super(Critic, self).__init__()
self.C1 = nn.Linear(state_dim, net_width)
self.C2 = nn.Linear(net_width, net_width)
self.C3 = nn.Linear(net_width, 1)
def forward(self, state):
v = torch.tanh(self.C1(state))
v = torch.tanh(self.C2(v))
v = self.C3(v)
return v
上述代码只定义了mu的个数与维度数一样,std作为可学习的参数之一。