🔥【pytorch】基本语法

安装与配置

看CUDA版本

nvidia-smi

官网上的安装教程

2.x的重大更新

# 编译，极大提升性能
model = torch.compile(model)

# torch.distributed、FSDP、DTensor 等能力持续改进

# Transformer 常用算子（如 attention 相关）和算子实现上持续优化

GPU

import torch
from torch import nn

torch.cuda.is_available() # 返回 True/False 表示GPU是否可用
torch.cuda.device_count() # 可用的GPU数量

使用GPU

# Pick the first GPU when one is available, otherwise fall back to the CPU.
# 'cuda' is equivalent to 'cuda:X', where X is torch.cuda.current_device().
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Module.to() moves the parameters in place and returns the very same module
# object, so the assignment is optional for a model.
model = model.to(device)
# Tensor.to() behaves differently: when a move is needed it returns a NEW
# tensor and leaves the original untouched, so the result must be assigned.
mytensor = my_tensor.to(device)

print(mytensor.device)
# 看模型在哪个设备上：(原理是看第一个参数所在的位置)
print(next(model.parameters()).device)

并行

# torch 默认不会做多显卡计算，用这个
model = nn.DataParallel(model)
# 目前官方更推荐改用 nn.parallel.DistributedDataParallel（DDP）

# Limit the intra-op CPU threads used by each worker process.
# torch.set_num_threads() requires a positive int, so use floor division and
# clamp both the divisor and the result to at least 1.
# Note: CPU usage rises sharply once this is set.
torch.set_num_threads(max(1, num_physical_cores // max(1, num_workers)))

# 加载并行
DataLoader(..., num_workers=args.nThreads)

[性能相关的其它资料] https://zhuanlan.zhihu.com/p/69250939

Tensor

新建

torch.empty(5, 3) 
torch.ones(3, 3)
torch.ones_like(...)
torch.zeros(5, 3, dtype=torch.long)
torch.eye(5)

torch.arange(start=0, end=8, step=2) # 含头不含尾
torch.linspace(start=0, end=9, steps=5) # 均匀的取5个值，含头含尾


# 随机生成
torch.manual_seed(2)         # 设置种子
print(torch.initial_seed())  # 查看种子
torch.rand(5, 3)             # 均匀分布 0～1
torch.randn(5, 3)            # 标准正态分布
torch.randn_like(x, dtype=torch.float)


# 从其它数据新建
torch.tensor([[5.5, 3],[2,3]], dtype=torch.float32)
torch.from_numpy(np.ones((5,5))) 

# 说明：
# 1. 全部可以有入参 device=device，或者 device='cuda:0'
# 2. 全部可以用 dtype=torch.float32 指定数据类型，数据类型如下
# 3. 要在同一个 device 上才可以运行

device

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device 还可以是字符串: device = 'cpu' # device = 'cuda'

# 在 CPU 上生成，然后转到 GPU
x = torch.randn(4, 4).to(device)
# 在 GPU 生成，然后转到 CPU
x = torch.randn(4, 4, device=device).cpu()
# 也可以用 to(device) 转 CPU

# 说明：转换设备时发生数据复制

数据类型转换

x.int()
x.long()
x.float()
x.bool()
x.char() # int8 类型
x.double()

# 有这些类型（还有很多）
torch.bool
torch.int
torch.short
torch.uint8 # 这个是 ByteTensor
torch.int
torch.int8
torch.int16
torch.int32
torch.long
torch.float
torch.float16
torch.float32
torch.float64
torch.double
torch.complex32
torch.complex64

基本运算

# Rounding family
a.round()  # round to the nearest integer (ties go to the nearest even integer)
a.fix()  # truncate toward zero (alias of trunc)
a.floor()  # round down, toward negative infinity


a + b
a * b    # 矩阵元素积
a @ b    # 矩阵积
a += 1

# 可以指定输出到哪个变量
result = torch.empty(5, 3)
torch.add(a, b, out=result)


# 大多数运算符后面可以加个下划线，表示替换运算
# 替换加
y.add_(x) # 这个会把加法的结果赋值给y
# 如 x.copy_(y), x.t_()

# 这些运算也可以不用符号，而是用函数表示:
a.matmul(b)

取数

a.size() # torch.Size([5, 3])
a.numel() # 共多少个元素

# index 和 Numpy 一样
a[:, 1]

# Select a single slice (a row or a column) along one dimension
a.select(dim=1,index=2) # take the slice at position `index` along dimension `dim`
# Here dim=1 picks a column, and index=2 is zero-based, i.e. the third column


# tensor 转其它格式
a.numpy() # 转 np.array
a.tolist() # 转 list

# 转 Python 数字，只有单个元素的时候可以用
a[0][0].item()

# 注意一个特性: 共享内存
x = torch.ones(2)
y = x.numpy()
x += 1
print(x, y)
# 打印：tensor([2., 2.]) [2. 2.]

# 值得一提，反过来也是这个特性
a = np.ones(5)
b = torch.from_numpy(a)
a += 1
print(a, b)
# [2. 2. 2. 2. 2.] tensor([2., 2., 2., 2., 2.], dtype=torch.float64)

reshape

# reshape 用于改变形状：-1 表示该维度由其余维度自动推断，reshape(-1) 会把张量展平为 1 维

a = torch.randn(4, 4)
a.reshape(2, -1)
a.reshape(-1)

数学运算

a.sqrt()
a.square()
a.exp()

a.cos()
a.cosh()
a.acos()
a.acosh()
a.arccos()
a.arccosh()

linalg

from torch import linalg

a = torch.rand(5, 5)


U, S, Vh = linalg.svd(a)
# full_matrices=False，可以提高性能

# 只返回奇异值：
linalg.svdvals(a)

e_val, e_vec = linalg.eig(a)
linalg.eigvals(a)


# linalg 还有很多方法，例如：
linalg.norm
linalg.qr

矩阵操作

d = a.diagonal()           # 取出对角线
a1 = torch.diag_embed(d)   # 还原为对角矩阵

a.flip(dims=(0,)) # 按照 dims 确定的维度翻转

a.t() # transpose (for tensors with at most 2 dimensions)


# The offset keyword in torch is `diagonal` (NumPy's `k` raises a TypeError).
# diagonal=0 keeps the main diagonal; >0 shifts above it, <0 shifts below it.
a.tril(diagonal=0)  # lower-triangular part
a.triu(diagonal=0)  # upper-triangular part

# cat
a = torch.rand(3, 2)
b = torch.rand(3, 2)

torch.cat([a, b], dim=0)

# Splitting
a = torch.arange(0, 12, step=1).reshape(2, 6)
# chunk: cut into 3 pieces that are as equal as possible along dim=1
# (widths 2, 2, 2 here).
a.chunk(chunks=3, dim=1)
# split: explicit section sizes along dim=1 -> widths 1, 3, 2.
# The sizes must add up to a.size(1) == 6.
torch.split(a, split_size_or_sections=(1, 3, 2), dim=1)

where

torch.where(x1 > 0.5, x1, x2)
torch.clip(x1, min=0.4, max=0.6)

按位运算

# dtype 必须是 int 类型，最好是 uint8
x1 = torch.tensor([1, 2, 3], dtype=torch.uint8)
x2 = torch.tensor([1, 1, 1], dtype=torch.uint8)


x1 & x2  # 按位与
x1 | x2  # 按位或
~x1  # 按位非
x1 ^ x2  # 按位异或
# 以上运算符对应的方法为：x1.bitwise_and(x2)、x1.bitwise_or(x2)、x1.bitwise_not()、x1.bitwise_xor(x2)

x1 << 1  # 移位运算
# x1.bitwise_left_shift(1)

逻辑运算

# 0 转为 False，别的数字都转为 True
x1 = torch.tensor([-0.9, 0, True, False], dtype=torch.bool)

# >、<、==、 >=、 <= 都可以
x2 = torch.rand(size=(4,)) < 0.5

# 逻辑与、或、异或、非
x1 & x2
x1 | x2
x1 ^ x2
~x1

# 其它方式：
# x1.logical_and(x2)
# x1.logical_or(x2)
# x1.logical_xor(x2)
# x1.logical_xor_(x2)
# x1.logical_not()
# x1.logical_not_()

统计类运算

a.mean()
a.mean(dim=1,keepdim=True)

a.max()
values, indices = a.max(dim=1, keepdim=True)

a.min()
a.mode()

values, indices = a.sort(dim=1, descending=False)

a.argmin()
a.argsort()
a.argmax(dim=1, keepdim=True)

a.histc
a.histogram

a.std()

激活函数

import torch.nn as nn
import torch.nn.functional as F
# 很多激活函数，在上面两个模块中是等价的，使用上的区别：
# nn 提供的是模块形式，适合放进 nn.Sequential
# F 提供的是函数形式，更灵活，适合条件调用、灵活的场景


# Module 方式
act = nn.ReLU(inplace=False)
y = act(x)

# Functional 方式
y = F.relu(x, inplace=False)

激活函数

nn.ReLU ReLU(Rectifier Linear Unit), $\max(0,x)$
nn.ReLU6 是 hard-sigmoid 的变种 $\min(\max(0,x),6)$，移动端/量化友好
nn.Sigmoid $1/(1+\exp(-x))$，优点是不容易出现极端值，但因为梯度消失问题，已经很少用了
nn.Tanh $\dfrac{e^{2x}-1}{e^{2x}+1}$
nn.Softsign 是符号函数的连续近似 $x/(|x|+1)$
nn.Softplus 是ReLU的平滑版 $\log(\exp(x)+1)$
nn.ELU ELU(Exponential Linear Unit) $\begin{cases} x, & x>0 \\ \alpha\left(e^{x}-1\right), & x\le 0 \end{cases}$

nn.Softmax(dim=dim)
torch.softmax(x, dim=1)    # 用于图片，对 channel 做 softmax
nn.LogSoftmax()  # 对 softmax 取对数，对应的损失函数是 NLLLoss

# 更现代的写法
nn.CrossEntropyLoss
# 等价于 nn.LogSoftmax + nn.NLLLoss

2019年5月22日更新（来自吴恩达的 DeepLearning.ai 课程）：

sigmoid: never use, except in the output layer
tanh: pretty much strictly superior to sigmoid
ReLU: if you do not know which to choose, always choose ReLU
Leaky ReLU: you may try this $\max(0.01z, z)$

激活函数的微分

sigmoid:$g(z)=\dfrac{1}{1+e^{-z}},\quad g'(z)=g(z)(1-g(z))$
tanh:$g(z)=\dfrac{e^z-e^{-z}}{e^z+e^{-z}},\quad g'(z)=1-(g(z))^2$
ReLU/Leaky ReLU：分段函数，注意0点的情况（但0点不是很重要）

卷积相关

# nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear
nn.Conv2d(in_channels=1
                , out_channels=16, kernel_size=(5, 5), stride=(1, 1)
                , padding='same' # 或者 padding=2
)

nn.MaxPool2d(kernel_size=2)

建立模型

import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict


class MyNet(nn.Module):
    """Toy network that demonstrates four ways to register sub-modules.

    Only ``fc1`` and ``fc2`` are used by ``forward``. ``block3`` and
    ``block4`` are registered (they appear in ``print(model)`` and in
    ``state_dict()``) but are never called.
    """

    def __init__(self):
        super().__init__()

        # Way 1 (the most common): plain attribute assignment.
        self.fc1 = nn.Linear(10, 10)

        # Way 2: register under a string name (dynamic naming; avoid if possible).
        self.add_module("fc2", nn.Linear(10, 10))

        # Way 3: a Sequential whose children are named through an OrderedDict.
        block3_layers = OrderedDict(
            conv1=nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(5, 5)),
            relu1=nn.ReLU(),
        )
        self.block3 = nn.Sequential(block3_layers)

        # Way 4: ways 2 and 3 combined.
        block4_layers = OrderedDict(
            conv1=nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(5, 5)),
            relu1=nn.ReLU(),
        )
        self.add_module("block4", nn.Sequential(block4_layers))

    def forward(self, x):
        """Return ``relu(fc2(relu(fc1(x))))``."""
        hidden = F.relu(self.fc1(x))
        out = F.relu(self.fc2(hidden))
        # A registered module has no effect unless forward actually calls it:
        # out = self.block3(out)
        # out = self.block4(out)
        return out


my_net = MyNet()
print(my_net)

如何使用模型

x = torch.randn(8, 10)

y = my_net(x)

保存和载入模型

# 存
torch.save(my_net.state_dict(),'./model.pkh')

my_net1 = MyNet().to(device)
# 读
my_net1.load_state_dict(torch.load('./model.pkh', weights_only=True))
# weights_only 只加载权重，更安全

# 关于 device 的说明：
# 1. state_dict 时，会连同所在设备号一起保存；在 load 时，会 load 到对应的设备上
# 2. 使用 map_location 可以指定 load 到哪个设备上
# 3. my_net1.load_state_dict(xxx) 时，会加载到 my_net1 所在的设备上。无论 load 时这些 tensor 在哪个设备上

# Choose the device the saved tensors are loaded onto:
device3 = torch.device("cuda:3")
torch.load('model.pkh', map_location=device3, weights_only=True)

# Or remap tensors saved on one device to another with a dict
# (here: everything saved on cuda:1 is loaded onto cuda:0).
torch.load('model.pkh', map_location={"cuda:1":"cuda:0"}, weights_only=True)

连同训练状态一起保存，以便断点续训

# 保存
ckpt = {
    "epoch": epoch,
    "model_state": model.state_dict(),
    "optim_state": optimizer.state_dict(),
    # 可选：
    # "sched_state": scheduler.state_dict(),
    # "scaler_state": scaler.state_dict(),   # AMP 用
}
torch.save(ckpt, "ckpt.pth")

# 加载
model = MyNet().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

ckpt = torch.load("ckpt.pth", map_location=device)
model.load_state_dict(ckpt["model_state"])
optimizer.load_state_dict(ckpt["optim_state"])

start_epoch = ckpt["epoch"] + 1
model.train()
for epoch in range(start_epoch, num_epochs):
    ...

safetensors: 更快、更安全、更标准的 tensor 保存格式

# pip install safetensors

# 保存
from safetensors.torch import save_file

state = my_net.state_dict()
save_file(state, "model.safetensors")

# 读取
from safetensors.torch import load_file

state = load_file("model.safetensors", device='cuda:1')  # 默认在 CPU，可以指定设备
my_net1 = MyNet()
my_net1.load_state_dict(state)

案例

model = Model(input_size, output_size)
if torch.cuda.device_count() > 1:
  print("Let's use", torch.cuda.device_count(), "GPUs!")
  # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
  model = nn.DataParallel(model)

model.to(device)

其它

torch.get_default_dtype()
torch.set_default_dtype(torch.float16)


torch.ByteTensor([1.1, 2, 3]) # 对于小数，会取整。对于溢出的数（大于255或负的），会舍弃溢出位数
# 但是，如果输入的是Tensor，会卡死，这么解决：
torch.tensor([1, 2, 3]).type(torch.int8)