七三笔记

Softmax 源码参考

Softmax 源码

 

/*
** 输入： input   一组输入图片数据（含义见下面softmax_cpu()注释，下同）
**       n       一组输入数据中含有的元素个数n=l.inputs/l.groups
**       temp    温度参数，关于softmax的温度参数，可以搜索一下softmax with temperature，应该会有很多的
**       stride  跨度
**       output  这一组输入图片数据对应的输出（也即l.output中与这一组输入对应的某一部分）
** 说明：本函数实现的就是标准的softmax函数处理，唯一有点变化的就是在做指数运算之前，将每个输入元素减去了该组输入元素中的最大值，以增加数值稳定性，
**      关于此，可以参考博客：http://freemind.pluskid.org/machine-learning/softmax-vs-softmax-loss-numerical-stability/，
**      这篇博客写的不错，博客中还提到了softmax-loss，此处没有实现（此处实现的也即博客中提到的softmax函数，将softmax-loss分开实现了）。
*/
void softmax(float *input, int n, float temp, int stride, float *output)
{
    int i;
    float sum = 0;
    // 赋初始最大值为float中的最小值-FLT_MAX（定义在float.h中）
    float largest = -FLT_MAX;
    // 寻找输入中的最大值，至于为什么要找出最大值，是为了数值计算上的稳定，详细请戳：http://freemind.pluskid.org/machine-learning/softmax-vs-softmax-loss-numerical-stability/
    // 这篇博客写的不错，博客在接近尾声的时候，提到了为什么要减去输入中的最大值。
    for(i = 0; i < n; ++i){
        if(input[i*stride] > largest) largest = input[i*stride];
    }
    for(i = 0; i < n; ++i){
        // 在进行指数运算之间，如上面博客所说，首先减去最大值（当然温度参数也要除）
        float e = exp(input[i*stride]/temp - largest/temp);
        sum += e;                       // 求和
        output[i*stride] = e;           // 并将每一个输入的结果保存在相应的输出中
    }
    // 最后一步：归一化转换为概率（就是softmax函数的原型～），最后的输出结果保存在output中
    for(i = 0; i < n; ++i){
        output[i*stride] /= sum;
    }
}

/**
    * @brief Apply softmax to `input`, group by group, writing the result to `output`.
    * @param input    all input data of the softmax layer for the whole batch,
    *                 i.e. net.input (the previous layer's output)
    * @param n        number of elements in one group, n = l.inputs / l.groups
    * @param batch    number of images in a batch (equals net.batch)
    * @param batch_offset    number of elements of one input image, i.e. l.inputs
    *                 (used to step through `input` one whole image at a time)
    * @param groups   number of groups the elements of one image are split into,
    *                 l.groups (set in the config file, defaulting to 1; most
    *                 networks use 1, which is the same as not having groups)
    * @param group_offset    equals n; used to step through one image a whole
    *                 group at a time
    * @param stride   step between the elements taken from each group, similar to
    *                 the INCX argument of axpy_cpu() and unrelated to l.stride of
    *                 a convolutional layer: only elements whose index is a
    *                 multiple of stride are used. With stride = 1 every element
    *                 is used. forward_softmax_layer() hard-codes stride to 1 when
    *                 calling this function (other callers may differ).
    * @param temp     softmax temperature, l.temperature (search for "softmax with
    *                 temperature" for background)
    * @param output   softmax result l.output (probabilities); it has as many
    *                 elements as input (see make_softmax_layer()), which also
    *                 implies stride must be 1 for the softmax layer, otherwise
    *                 fewer outputs than inputs would be written
    * @note These comments describe the call made by softmax_layer; other call
    *       sites are documented where they occur. Terminology: `input` holds the
    *       data of every image in the batch, one image has `inputs` elements,
    *       and an image is split into `groups` groups of n = l.inputs / l.groups
    *       elements each.
*/
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
{
    int img, grp;
    /* Outer loop: one iteration per image in the batch. */
    for(img = 0; img < batch; ++img){
        /* Inner loop: one softmax per group within that image. */
        for(grp = 0; grp < groups; ++grp){
            /* input and output share the same layout, hence the same offset. */
            int offset = img*batch_offset + grp*group_offset;
            softmax(input + offset, n, temp, stride, output + offset);
        }
    }
}

去最大值实现

 
import torch
# Shift every entry by the tensor-wide maximum so that all values are <= 0
# (the largest entry becomes exactly 0).
a = torch.tensor([[1, 2, 3], [3, 5, 7]])
a = torch.sub(a, torch.max(a))
a

tensor([[-6, -5, -4],
        [-4, -2,  0]])

 
# Exponentiate the shifted values, then rescale each row so it sums to 1.
a = a.exp()
a_len = len(a)
for i in range(a_len):
    a[i] = torch.div(a[i], torch.sum(a[i]))

a
tensor([[0.0900, 0.2447, 0.6652],
        [0.0159, 0.1173, 0.8668]])

不去最大值

 
import torch
# Same softmax as before, but without subtracting the maximum first.
a = torch.tensor([[1, 2, 3], [3, 5, 7]])
a = a.exp()
a_len = len(a)
for i in range(a_len):
    a[i] = torch.div(a[i], torch.sum(a[i]))

a
tensor([[0.0900, 0.2447, 0.6652],
        [0.0159, 0.1173, 0.8668]])

总结

 
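最终结果一样：softmax 对输入整体减去同一个常数 c 结果不变，因为 exp(x_i - c) / Σ_j exp(x_j - c) = exp(x_i) / Σ_j exp(x_j)。
减去最大值是为了数值稳定：如果某个输入过大，exp 指数运算会产生一个极大的数，甚至上溢为 inf，归一化时就会得到 nan；
全部转为非正数后，exp 的结果都落在 (0, 1] 区间内，
因此不会上溢，而且最大的那一项恰好为 1，分母也不会为 0。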

代码摘要

 
batch_size = x.shape[0]

# View the data as one row per sample so the softmax can be computed without a
# Python loop or in-place row writes.
flat = x.reshape(batch_size, -1)
# Subtract each sample's own maximum: every exponent is then <= 0 (no
# overflow) and the largest term is exactly 1 (the sum can never be 0).
# Subtracting the global maximum instead lets a sample far below it underflow
# to all zeros and turn into 0/0 = NaN.
flat = torch.exp(flat - flat.amax(dim=1, keepdim=True))
x = (flat / flat.sum(dim=1, keepdim=True)).reshape(x.shape)

全代码

 
import torch
from torch import nn
import torchvision

 
class DLModel(nn.Module):
    """Linear layer followed by a hand-written softmax.

    The softmax normalises each sample (index along dim 0) over all of its
    remaining elements, so every ``out[i]`` sums to 1.
    """

    def __init__(self, in_features, out_features):
        """Create the linear projection.

        Args:
            in_features: Size of the last dimension of the input.
            out_features: Size of the last dimension of the output.
        """
        super(DLModel, self).__init__()

        self.linear = nn.Linear(in_features=in_features, out_features=out_features)

    def forward(self, X):
        """Run the forward pass.

        The input flows through the linear layer and is then turned into
        per-sample probabilities. The forward pass itself does not change the
        parameters; that is the optimizer's job.

        Args:
            X: Input tensor whose last dimension has ``in_features`` elements
                and whose first dimension indexes the samples.

        Returns:
            Tensor with the same shape as the linear layer's output, in which
            each ``out[i]`` sums to 1.
        """
        x = self.linear(X)

        batch_size = x.shape[0]
        # Debug trace kept from the original code.
        print("batch_size:", batch_size)

        # One row per sample: no Python loop and no in-place row writes, both
        # of which are awkward for tracing/ONNX export.
        flat = x.reshape(batch_size, -1)
        # Subtract each sample's own maximum so every exponent is <= 0 and the
        # largest term is exactly 1. Using the global maximum instead lets a
        # sample far below it underflow to all zeros and become 0/0 = NaN.
        flat = torch.exp(flat - flat.amax(dim=1, keepdim=True))
        out = (flat / flat.sum(dim=1, keepdim=True)).reshape(x.shape)

        return out
    
# Linear layer mapping the last dimension from 32 to 32 features.
model = DLModel(in_features=32, out_features=32)

# Dummy input laid out as [B, C, H, W].
h0 = torch.zeros((64, 3, 32, 32))

y_out = model(h0)

y_out.shape

 
# [B, C, H, W]. Tracing has to actually run the model once to record its
# static graph, so a sample input is required.
h0 = torch.zeros((1, 3, 32, 32))

# Advice for trace-based export: avoid for loops inside the model, keep data
# manipulation that can live outside the model out of it, and avoid in-place
# operations and other fancy syntax. Keep the model simple, otherwise the
# conversion to ONNX may be incomplete.
torch.onnx.export(
    model=model,

    # Arguments for the model, i.e. the `args` of `y_out = model(args)`.
    # Only the shape matters for running the model once; real data is not needed.
    args=(h0,),

    # Path of the exported file.
    f="model02.onnx",

    # Names for the input and output nodes, handy for later inspection.
    input_names=["input1"],
    output_names=["output1"],

    # Only the batch dimension is dynamic; keep the others static.
    dynamic_axes={
        "input1": {0: "batch"},
        "output1": {0: "batch"},
    },

    # Operator set used for the export (corresponds to symbolic_opset11).
    opset_version=11,

    # Export the model parameters as well (True is the default).
    export_params=True,

    # Inference (eval) mode: dropout, BatchNorm etc. are fixed or disabled.
    training=torch.onnx.TrainingMode.EVAL,

    # Print detailed information about the exported graph.
    verbose=True,
)

 
batch_size: tensor(1)
Exported graph: graph(%input1 : Float(*, 3, 32, 32, strides=[3072, 1024, 32, 1], requires_grad=0, device=cpu),
        %linear.bias : Float(32, strides=[1], requires_grad=1, device=cpu),
        %onnx::MatMul_62 : Float(32, 32, strides=[1, 32], requires_grad=0, device=cpu)):
    %/linear/MatMul_output_0 : Float(*, 3, 32, 32, device=cpu) = onnx::MatMul[onnx_name="/linear/MatMul"](%input1, %onnx::MatMul_62), scope: __main__.DLModel::/torch.nn.modules.linear.Linear::linear # /ai/app/anaconda3/lib/python3.9/site-packages/torch/nn/modules/linear.py:114:0
   ...
   ...
   ...
    %/Concat_2_output_0 : Long(4, strides=[1], device=cpu) = onnx::Concat[axis=0, onnx_name="/Concat_2"](%/Constant_11_output_0, %/Slice_2_output_0), scope: __main__.DLModel:: # /tmp/ipykernel_482/950586334.py:31:0
    %/Reshape_2_output_0 : Float(*, *, *, *, device=cpu) = onnx::Reshape[onnx_name="/Reshape_2"](%/Expand_2_output_0, %/Concat_2_output_0), scope: __main__.DLModel:: # /tmp/ipykernel_482/950586334.py:31:0
    %output1 : Float(*, 3, 32, 32, strides=[3072, 1024, 32, 1], requires_grad=1, device=cpu) = onnx::ScatterND[onnx_name="/ScatterND_2"](%/ScatterND_1_output_0, %/Constant_12_output_0, %/Reshape_2_output_0), scope: __main__.DLModel:: # /tmp/ipykernel_482/950586334.py:31:0
    return (%output1)

 
import torch
from torch import nn
import torchvision

 
class DLModel(nn.Module):
    """Linear layer followed by a hand-written softmax.

    The softmax normalises each sample (index along dim 0) over all of its
    remaining elements, so every ``out[i]`` sums to 1.
    """

    def __init__(self, in_features, out_features):
        """Create the linear projection.

        Args:
            in_features: Size of the last dimension of the input.
            out_features: Size of the last dimension of the output.
        """
        super(DLModel, self).__init__()

        self.linear = nn.Linear(in_features=in_features, out_features=out_features)

    def forward(self, X):
        """Run the forward pass.

        The input flows through the linear layer and is then turned into
        per-sample probabilities. The forward pass itself does not change the
        parameters; that is the optimizer's job.

        Args:
            X: Input tensor whose last dimension has ``in_features`` elements
                and whose first dimension indexes the samples.

        Returns:
            Tensor with the same shape as the linear layer's output, in which
            each ``out[i]`` sums to 1.
        """
        x = self.linear(X)

        batch_size = x.shape[0]
        # Debug trace kept from the original code.
        print("batch_size:", batch_size)

        # One row per sample: no Python loop and no in-place row writes, both
        # of which are awkward for tracing/ONNX export.
        flat = x.reshape(batch_size, -1)
        # Subtract each sample's own maximum so every exponent is <= 0 and the
        # largest term is exactly 1. Using the global maximum instead lets a
        # sample far below it underflow to all zeros and become 0/0 = NaN.
        flat = torch.exp(flat - flat.amax(dim=1, keepdim=True))
        out = (flat / flat.sum(dim=1, keepdim=True)).reshape(x.shape)

        return out

最后一层是全连接，输出softmax时的数据shape为[B,C]

 
# Linear layer mapping 5 input features to 2 output features.
model = DLModel(in_features=5, out_features=2)

# Dummy input laid out as [B, C].
h0 = torch.zeros((3, 5))

y_out = model(h0)

y_out.shape

softmax将之转化为概率

 
y_out
tensor([[0.3417, 0.6583],
[0.3417, 0.6583],
[0.3417, 0.6583]], grad_fn=CopySlices)

参考

七三笔记路线：学习，记录，分享