pytorch框架自动调整学习率的几种方式

目录一、引言一、引言学习率的对于炼丹师来说非常重要，本文主要总结几种pytorch中常用的几种调整学习率的几种方式。学习率要在损失和收敛速度中做出权衡。学习率对于深度学习炼丹来说尤为重要，一个合适的学习率不仅能加速训练的拟合，还能更好地逼近最优解。固定的学习率随着深度学习模型逐渐上升的复杂性已不太适用，动态调整学习率或者对模型不同部分设置不同的学习率已成为一种炼丹趋势1。参考一所有Optimi

Salute=

6193人浏览 · 2022-04-24 22:26:36

Salute= · 2022-04-24 22:26:36 发布

一、前言

学习率要在收敛和收敛速度中做出权衡，合适的学习率能以最快速度逼近最优解同时使得损失不断下降。在复杂网络中，固定的学习率一般无法得到网络的最优解，动态调整学习率或对模型不同部分设置不同的学习率已成为一种训练模型的趋势。

二、Pytorch中自动调整学习率的几种方式

2.1 ExponentialLR-指数衰减方式

torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma,last_epoch=-1)

其中，optimizer为指定的优化器， $\gamma$ 为指数的底数，通常将 $\gamma$ 设置为接近于1的数。
学习率更新公式如下所示：
$lr_{new}=lr_{init}*\gamma^{epoch}$ 其中， $lr_{init}$ 为初始学习率， $lr_{new}$ 为更新后的学习率， $epoch$ 为当前训练迭代次数。

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ExponentialLR
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Matplotlib text setup: SimHei as the sans-serif font so Chinese labels can
# be rendered, and unicode_minus disabled so minus signs are displayed.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
})

# Fixed seed so that every run produces the same random numbers.
SEED = 40


# Build the network
class Net(torch.nn.Module):
    """Small fully connected network: Linear(50, 100) -> LeakyReLU -> Linear(100, 50) -> LeakyReLU."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(50, 100)
        self.leakrelu = nn.LeakyReLU(0.001)   # negative slope 0.001, shared by both layers
        self.linear1 = nn.Linear(100, 50)

    def forward(self, input):
        """Forward pass: apply each linear layer followed by the shared LeakyReLU."""
        hidden = self.linear(input)
        hidden = self.leakrelu(hidden)
        projected = self.linear1(hidden)
        return self.leakrelu(projected)

# ---- bookkeeping and reproducibility ----
lr = []                              # learning rate in effect during each epoch
epochTrainLoss = []                  # training loss of each epoch (Python floats)
epochValLoss = []                    # validation loss of each epoch (Python floats)
maxepoch = 50
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

# Random training/validation data, created on the CPU and moved to DEVICE once
# here instead of being re-sent on every epoch.
input_train = torch.randn(10, 50).to(DEVICE)      # training inputs
target_train = torch.randn(10, 50).to(DEVICE)     # training targets
input_val = torch.randn(10, 50).to(DEVICE)        # validation inputs
target_val = torch.randn(10, 50).to(DEVICE)       # validation targets

# Move the model to DEVICE before building the optimizer, as the torch.optim
# documentation recommends.
net = Net().to(DEVICE)
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)   # Adam optimizer
scheduler = ExponentialLR(optimizer, gamma=0.9)           # multiply lr by gamma on every scheduler.step()
loss_fn = torch.nn.MSELoss().to(DEVICE)                   # mean-squared-error loss


# ---- training ----
for epoch in range(maxepoch):
    # Record the learning rate that this epoch's optimizer step will use.
    current_lr = scheduler.get_last_lr()[0]
    lr.append(current_lr)
    print(epoch, current_lr)

    net.train()                 # training mode
    optimizer.zero_grad()       # clear gradients left from the previous epoch
    output = net(input_train)
    trainLoss = loss_fn(output, target_train)
    trainLoss.backward()
    epochTrainLoss.append(trainLoss.item())
    optimizer.step()            # update the model parameters
    scheduler.step()            # update the learning rate (after optimizer.step())

    net.eval()                  # evaluation mode
    with torch.no_grad():       # no gradient tracking during validation
        output_val = net(input_val)
        valLoss = loss_fn(output_val, target_val)
        epochValLoss.append(valLoss.item())
    print("epoch = %02d  trainLoss = %.4f valLoss = %.4f" % (epoch, epochTrainLoss[-1], epochValLoss[-1]))

# ---- plot the learning-rate curve ----
plt.figure()
x = list(range(maxepoch))
plt.plot(x, lr)
plt.xlabel('epochs')
plt.ylabel('lr')
plt.show()

# ---- plot the training/validation loss curves ----
plt.figure()
plt.plot(x, epochTrainLoss)
plt.plot(x, epochValLoss)
plt.legend(['trainLoss','valLoss'])
plt.xlabel('epochs')
plt.ylabel('Loss')
plt.show()

2.2 ExponentialLR方式对网络训练的影响

(1) 学习率按照ExponentialLR方式衰减，初始参数如下所示：
1) 初始学习率 $lr_{init}=0.1$
2) ExponentialLR参数 $\gamma=0.9$

(2) 学习率为固定常数
1) 初始学习率 $lr_{init}=0.01$
2) ExponentialLR参数 $\gamma=1$

2.3 MultiStepLR-按给定间隔调整学习率

torch.optim.lr_scheduler.MultiStepLR(optimizer,milestones,gamma,last_epoch=-1)

其中，optimizer为指定的优化器，milestones为需要调整学习率的epoch索引列表(必须递增)， $\gamma$ 为学习率衰减的乘法因子，每到达一个milestone，学习率就乘以一次 $\gamma$ 。当 $milestones=[x_{1}, x_{2}]$ , 且满足 $x_{1}<x_{2}<maxEpoch=50$ 时，更新公式如下所示：
$lr_{new}= \begin{cases} lr_{init} & epoch \in\left[0, x_{1}\right) \\ lr_{init} * \gamma & epoch \in\left[x_{1}, x_{2}\right) \\ lr_{init} * \gamma^{2} & epoch \in\left[x_{2}, 50\right)\end{cases}$ 其中， $lr_{init}$ 为初始学习率， $lr_{new}$ 为更新后的学习率， $epoch$ 为当前训练迭代次数， $maxEpoch = 50$ 为最大训练次数。

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ExponentialLR, MultiStepLR
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Matplotlib text setup: SimHei as the sans-serif font so Chinese labels can
# be rendered, and unicode_minus disabled so minus signs are displayed.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
})

# Fixed seed so that every run produces the same random numbers.
SEED = 40


# Build the network
class Net(torch.nn.Module):
    """Small fully connected network: Linear(50, 100) -> LeakyReLU -> Linear(100, 50) -> LeakyReLU."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(50, 100)
        self.leakrelu = nn.LeakyReLU(0.001)   # negative slope 0.001, shared by both layers
        self.linear1 = nn.Linear(100, 50)

    def get_params(self, weight_decay=0.0):
        """Return the model parameters as two optimizer param groups.

        Parameters whose name contains 'bias' form the second group, which
        always uses a weight decay of 0.0; every other parameter goes into
        the first group with the given ``weight_decay``.
        """
        named = list(self.named_parameters())
        weight_params = [p for pname, p in named if 'bias' not in pname]
        bias_params = [p for pname, p in named if 'bias' in pname]
        return [
            {'params': weight_params, 'weight_decay': weight_decay},
            {'params': bias_params, 'weight_decay': 0.0},
        ]

    def forward(self, input):
        """Forward pass: apply each linear layer followed by the shared LeakyReLU."""
        hidden = self.linear(input)
        hidden = self.leakrelu(hidden)
        projected = self.linear1(hidden)
        return self.leakrelu(projected)


# ---- bookkeeping and reproducibility ----
lr = []                              # learning rate in effect during each epoch
epochTrainLoss = []                  # training loss of each epoch (Python floats)
epochValLoss = []                    # validation loss of each epoch (Python floats)
maxepoch = 50
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

# Random training/validation data, created on the CPU and moved to DEVICE once
# here instead of being re-sent on every epoch.
input_train = torch.randn(10, 50).to(DEVICE)      # training inputs
target_train = torch.randn(10, 50).to(DEVICE)     # training targets
input_val = torch.randn(10, 50).to(DEVICE)        # validation inputs
target_val = torch.randn(10, 50).to(DEVICE)       # validation targets

# Move the model to DEVICE before building the optimizer, as the torch.optim
# documentation recommends.
net = Net().to(DEVICE)
optimizer = torch.optim.Adam(net.parameters(), lr=0.1)   # Adam optimizer
# Multiply lr by gamma each time the epoch counter reaches a milestone.
# The milestone at 50 is only reached by the final scheduler.step() call, so
# its decay never shows up in the 50 recorded epochs.
scheduler = MultiStepLR(optimizer, milestones=[4, 20, 40, 50], gamma=0.9)

loss_fn = torch.nn.MSELoss().to(DEVICE)                  # mean-squared-error loss

# ================================== training ===================================#
for epoch in range(maxepoch):
    # Record the learning rate that this epoch's optimizer step will use.
    current_lr = scheduler.get_last_lr()[0]
    lr.append(current_lr)
    print(epoch, current_lr)

    net.train()                 # training mode
    optimizer.zero_grad()       # clear gradients left from the previous epoch
    output = net(input_train)
    trainLoss = loss_fn(output, target_train)
    trainLoss.backward()
    epochTrainLoss.append(trainLoss.item())
    optimizer.step()            # update the model parameters
    scheduler.step()            # update the learning rate (after optimizer.step())

    net.eval()                  # evaluation mode
    with torch.no_grad():       # no gradient tracking during validation
        output_val = net(input_val)
        valLoss = loss_fn(output_val, target_val)
        epochValLoss.append(valLoss.item())
    print("epoch = %02d  trainLoss = %.4f valLoss = %.4f" % (epoch, epochTrainLoss[-1], epochValLoss[-1]))


# ---- plot the learning-rate curve ----
plt.figure()
x = list(range(maxepoch))
plt.plot(x, lr)
plt.title('学习率变化曲线图')
plt.xlabel('epochs')
plt.ylabel('lr')
plt.show()

# ---- plot the training/validation loss curves ----
plt.figure()
plt.plot(x, epochTrainLoss)
plt.plot(x, epochValLoss)
plt.title('损失曲线图')
plt.legend(['trainLoss','valLoss'])
plt.xlabel('epochs')
plt.ylabel('Loss')
plt.show()

2.4 MultiStepLR调整学习率对网络训练的影响

(1) 按照给定间隔调整学习率
1) 初始学习率 $lr_{init}=0.1$
2) MultiStepLR参数 $\gamma=0.5$

(2) 学习率为固定常数：
1) 初始学习率 $lr_{init}=0.01$
2) MultiStepLR参数 $\gamma=1$

2.5 CosineAnnealingLR-余弦周期调整学习率

torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max,eta_min=0,last_epoch=-1)

其中，optimizer为指定的优化器， $T_{max}$ 为学习率下降到最低值所需次数， $eta\_min$ 为学习率下降下限。
更新公式如下所示：
$\eta_{t}=\eta_{\min }+\frac{1}{2}\left(\eta_{\max }-\eta_{\min }\right)\left(1+\cos \left(\frac{T_{\text {cur }}}{T_{\max }} \pi\right)\right)$ 其中， $\eta_{\min }=eta\_min$ 为设置的最低学习率， $\eta_{t}$ 为更新后的学习率， $\eta_{\max }$ 为初始学习率， $T_{\text {cur }}$ 为当前训练迭代次数， $T_{max}=T\_max$ 为学习率下降到最低值所需次数。

2.6 CosineAnnealingLR调整学习率对网络训练的影响

(1) 余弦周期调整学习率
1) 初始学习率 $\eta_{\max }=0.1$
2) CosineAnnealingLR参数 $eta\_min=0.0001, T_{\max }=20$ ， $T_{\text{max}}$ 表示下降到最小学习率 $eta\_min$ 所需迭代次数。

(2) 学习率为固定常数：

1) 初始学习率 $\eta_{\max }=0.01$

2.7 ReduceLROnPlateau-监控某种特定的指标(如，校验损失)

torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08, verbose=False)

参数：
(1) optimizer为指定的优化器
(2) $mode$ 监控模式，min-指标不再下降时更新学习率，max-指标不再增大时更新学习率
(3) factor为学习率调整因子；patience-连续patience次指标不下降或不上升(取决于mode)时，就更新学习率
(4) cooldown：每更新一次学习率后停止监控一段时间(如cooldown=5，表示更新学习率后间隔5个epoch再继续监控相应指标)
(5) verbose：是否打印日志-布尔变量，默认False
(6) min_lr：学习率下限
(7) eps：学习率的最小衰减量；若新旧学习率之差小于eps，则忽略本次更新
满足条件后，更新公式如下所示：
$lr_{new}=\max(lr_{old}*factor,\ min\_lr)$ 其中， $lr_{old}$ 为调整前的当前学习率， $lr_{new}$ 为更新后的学习率， $factor$ 为学习率调整因子， $min\_lr$ 为学习率下限。

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ExponentialLR, MultiStepLR, CosineAnnealingLR, ReduceLROnPlateau
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Matplotlib text setup: SimHei as the sans-serif font so Chinese labels can
# be rendered, and unicode_minus disabled so minus signs are displayed.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
})

# Fixed seed so that every run produces the same random numbers.
SEED = 40


# Build the network
class Net(torch.nn.Module):
    """Small fully connected network: Linear(50, 100) -> LeakyReLU -> Linear(100, 50) -> LeakyReLU."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(50, 100)
        self.leakrelu = nn.LeakyReLU(0.001)   # negative slope 0.001, shared by both layers
        self.linear1 = nn.Linear(100, 50)
        self.elu = nn.ELU()                   # registered, but not used by forward()

    def get_params(self, weight_decay=0.0):
        """Return the model parameters as two optimizer param groups.

        Parameters whose name contains 'bias' form the second group, which
        always uses a weight decay of 0.0; every other parameter goes into
        the first group with the given ``weight_decay``.
        """
        named = list(self.named_parameters())
        weight_params = [p for pname, p in named if 'bias' not in pname]
        bias_params = [p for pname, p in named if 'bias' in pname]
        return [
            {'params': weight_params, 'weight_decay': weight_decay},
            {'params': bias_params, 'weight_decay': 0.0},
        ]

    def forward(self, input):
        """Forward pass: apply each linear layer followed by the shared LeakyReLU."""
        hidden = self.linear(input)
        hidden = self.leakrelu(hidden)
        projected = self.linear1(hidden)
        return self.leakrelu(projected)


# ---- bookkeeping and reproducibility ----
lr = []                              # learning rate in effect during each epoch
epochTrainLoss = []                  # training loss of each epoch (Python floats)
epochValLoss = []                    # validation loss of each epoch (Python floats)
maxepoch = 50
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

# Random training/validation data, created on the CPU and moved to DEVICE once
# here instead of being re-sent on every epoch.
input_train = torch.randn(10, 50).to(DEVICE)      # training inputs
target_train = torch.randn(10, 50).to(DEVICE)     # training targets
input_val = torch.randn(10, 50).to(DEVICE)        # validation inputs
target_val = torch.randn(10, 50).to(DEVICE)       # validation targets

# Move the model to DEVICE before building the optimizer, as the torch.optim
# documentation recommends.
net = Net().to(DEVICE)
optimizer = torch.optim.Adam(net.parameters(), lr=0.1)   # Adam optimizer
# Halve the lr when the monitored validation loss stops improving (patience=1).
# `verbose=True` is intentionally not passed: the argument is deprecated in
# recent PyTorch releases, and the lr is already printed every epoch below.
scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=1)

loss_fn = torch.nn.MSELoss().to(DEVICE)                  # mean-squared-error loss


# ================================== training ===================================#
for epoch in range(maxepoch):
    # Read the learning rate currently used by the optimizer.
    current_lr = optimizer.param_groups[0]["lr"]
    lr.append(current_lr)
    print(epoch, current_lr)

    net.train()                 # training mode
    optimizer.zero_grad()       # clear gradients left from the previous epoch
    output = net(input_train)
    trainLoss = loss_fn(output, target_train)
    trainLoss.backward()
    epochTrainLoss.append(trainLoss.item())
    optimizer.step()            # update the model parameters

    net.eval()                  # evaluation mode
    with torch.no_grad():       # no gradient tracking during validation
        output_val = net(input_val)
        valLoss = loss_fn(output_val, target_val)
    epochValLoss.append(valLoss.item())
    scheduler.step(epochValLoss[-1])     # update the learning rate from the validation loss
    print("epoch = %02d  trainLoss = %.4f valLoss = %.4f" % (epoch, epochTrainLoss[-1], epochValLoss[-1]))


# ---- plot the learning-rate curve ----
plt.figure()
x = list(range(maxepoch))
plt.plot(x, lr)
plt.title('学习率变化曲线图')
plt.xlabel('epochs')
plt.ylabel('lr')
plt.show()

# ---- plot the training/validation loss curves ----
plt.figure()
plt.plot(x, epochTrainLoss)
plt.plot(x, epochValLoss)
plt.title('损失曲线图')
plt.legend(['trainLoss','valLoss'])
plt.xlabel('epochs')
plt.ylabel('Loss')
plt.show()

注意：(1) 在较早版本的PyTorch中，ReduceLROnPlateau没有get_last_lr()方法，因此上面的代码通过optimizer.param_groups[0]["lr"]读取当前学习率
(2) scheduler.best 保存着迄今为止所监控指标(如， $valLoss$ )的最优值(mode='min'时即最小值)

2.8 ReduceLROnPlateau调整学习率对网络训练的影响

(1) 按特定指标调整学习率
1) 初始学习率 $\eta_{\max }=0.1$
2) ReduceLROnPlateau参数 $factor=0.5, patience=1$

(2) 学习率为固定常数：
1) 初始学习率 $\eta_{\max }=0.01$

2.9 LambdaLR-自定义函数调整学习率

torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=- 1, verbose=False)

参数：
(1) optimizer封装好的优化器
(2) lr_lambda (function or list) –为函数或此类函数的列表；为列表时，其长度须与optimizer.param_groups中参数组的个数相同。
(3) verbose-是否将更改学习的信息输出至控制台，默认False。
(4) last_epoch (int) – 最后一个迭代epoch的索引. Default: -1.

# ====================== 一、当lr_lambda为单个函数时(此处使用lambda2)============================== #
import torch
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import LambdaLR
# Matplotlib text setup: SimHei so Chinese labels can be rendered, and
# unicode_minus disabled so minus signs are displayed.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

lr = []  # learning rate in effect during each epoch
maxepoch = 50
# A single stand-alone parameter tensor is enough to build an optimizer.
params = [torch.nn.Parameter(torch.randn(4, 4, requires_grad=True))]
optimizer = torch.optim.Adam(params, lr=0.1)  # Adam optimizer

# Custom schedule functions; LambdaLR sets lr = initial_lr * lr_lambda(epoch).
lambda1 = lambda epoch: epoch // 10      # factor is 0 for epochs 0-9; defined but not used below
lambda2 = lambda epoch: 0.2 ** epoch     # factor shrinks by 5x every epoch
# `verbose=True` is intentionally not passed: the argument is deprecated in
# recent PyTorch releases, and the lr is already printed every epoch below.
scheduler = LambdaLR(optimizer, lr_lambda=lambda2)

# No loss or backward pass is computed here; the loop only steps the optimizer
# and scheduler to record the learning-rate schedule.
for epoch in range(maxepoch):
    optimizer.zero_grad()
    current_lr = scheduler.get_last_lr()[0]
    lr.append(current_lr)
    print(epoch, current_lr)
    optimizer.step()
    scheduler.step()

# ---- plot the learning-rate curve ----
plt.figure()
x = list(range(maxepoch))
plt.plot(x, lr)
plt.title('学习率变化曲线图')
plt.xlabel('epochs')
plt.ylabel('lr')
plt.show()

# ====================== 二、当lr_lambda=[lambda1, lambda2]时============================== #
import torch
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ExponentialLR, MultiStepLR, CosineAnnealingLR, ReduceLROnPlateau, LambdaLR
# Matplotlib text setup: SimHei so Chinese labels can be rendered, and
# unicode_minus disabled so minus signs are displayed.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

lr1 = []  # learning rate of the first param group, per epoch
lr2 = []  # learning rate of the second param group, per epoch
maxepoch = 50
# Two stand-alone parameter tensors, one per optimizer param group.
params1 = [torch.nn.Parameter(torch.randn(4, 4, requires_grad=True))]
params2 = [torch.nn.Parameter(torch.randn(4, 4, requires_grad=True))]
optimizer = torch.optim.Adam([{'params': params1},
                              {'params': params2}], lr=0.1)  # Adam optimizer

# One schedule function per param group; LambdaLR sets
# lr = initial_lr * lr_lambda(epoch) for the matching group.
lambda1 = lambda epoch: 1/(epoch+1)
lambda2 = lambda epoch: 0.2 ** epoch
# `verbose=True` is intentionally not passed: the argument is deprecated in
# recent PyTorch releases, and both lrs are already printed every epoch below.
scheduler = LambdaLR(optimizer, lr_lambda=[lambda1, lambda2])

# No loss or backward pass is computed here; the loop only steps the optimizer
# and scheduler to record the learning-rate schedules.
for epoch in range(maxepoch):
    optimizer.zero_grad()
    # get_last_lr() returns one value per param group.
    group1_lr, group2_lr = scheduler.get_last_lr()
    lr1.append(group1_lr)
    lr2.append(group2_lr)
    print(epoch, group1_lr)
    print(epoch, group2_lr)
    optimizer.step()
    scheduler.step()

print(scheduler.state_dict())

# ---- plot both learning-rate curves side by side ----
plt.figure()
plt.subplot(121)
x = list(range(maxepoch))
plt.plot(x, lr1)
plt.title('学习率变化曲线图')
plt.xlabel('epochs')
plt.ylabel('lr')
plt.legend(['lambda1'])
plt.subplot(122)
plt.plot(x, lr2)
plt.title('学习率变化曲线图')
plt.legend(['lambda2'])
plt.xlabel('epochs')
plt.ylabel('lr')
plt.show()

2.10 LambdaLR学习率调整结果展示

(1) 使用单一自定义函数改变学习率

(2) 使用多个自定义函数更改学习率

2.11 StepLR学习率调整结果展示

# scheduler = StepLR(optimizer, step_size=20, gamma=0.1)
# Assuming optimizer uses lr = 0.1 for all groups
# lr = 0.1      if epoch < 20
# lr = 0.01     if 20 <= epoch < 40
# lr = 0.001    if 40 <= epoch < 50    maxepoch=50
import torch
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import StepLR
# Matplotlib text setup: SimHei so Chinese labels can be rendered, and
# unicode_minus disabled so minus signs are displayed.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
})

lr = []  # learning rate in effect during each epoch
maxepoch = 50
# A single stand-alone parameter tensor is enough to build an optimizer.
params = [torch.nn.Parameter(torch.randn(4, 4, requires_grad=True))]
optimizer = torch.optim.Adam(params, lr=0.1)  # Adam optimizer

# StepLR: multiply the lr by gamma every step_size epochs.
scheduler = StepLR(optimizer, step_size=20, gamma=0.1)

# No loss or backward pass is computed here; the loop only steps the optimizer
# and scheduler to record the learning-rate schedule.
for epoch in range(maxepoch):
    optimizer.zero_grad()
    current_lr = scheduler.get_last_lr()[0]
    lr.append(current_lr)
    print(epoch, current_lr)
    optimizer.step()
    scheduler.step()
print(scheduler.state_dict())