Introduction

  This article covers the basic attributes, basic methods, and role of the optimizer (optimizer) in PyTorch.

I. What Is an Optimizer?

  A PyTorch optimizer manages and updates the learnable parameters of a model so that the model's output gets closer to the ground-truth labels. Put simply, it uses the gradients to update the learnable parameters so that the loss decreases.

II. Basic Attributes of the Optimizer

class Optimizer(object):
    def __init__(self, params, defaults):
        self.defaults = defaults                    # optimizer hyperparameters
        self.state = defaultdict(dict)              # per-parameter state (e.g. momentum buffers)
        self.param_groups = []                      # the managed parameter groups
        ...
        param_groups = [{'params': param_groups}]   # the incoming params are wrapped into a group
  • defaults: the optimizer's hyperparameters
  • state: cached per-parameter state, e.g. momentum buffers
  • param_groups: the parameter groups being managed
  • _step_count: records the number of update steps taken, used when adjusting the learning rate (these attributes can be inspected directly, as shown in the sketch below)
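
To see what these attributes hold, the sketch below builds an SGD optimizer for a single tensor and prints them; the tensor shape and hyperparameter values are illustrative only:

import torch
import torch.optim as optim

w = torch.randn(2, 2, requires_grad=True)
optimizer = optim.SGD([w], lr=0.1, momentum=0.9)

print(optimizer.defaults)       # {'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, ...}
print(optimizer.param_groups)   # one group holding w together with the group's hyperparameters
print(optimizer.state)          # empty defaultdict; filled (e.g. momentum buffers) after the first step()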

III. Basic Methods of the Optimizer

class Optimizer(object):
    def __init__(self, params, defaults):
        self.defaults = defaults
        self.state = defaultdict(dict)
        self.param_groups = []
        ...
        param_groups = [{'params': param_groups}]

    def zero_grad(self):
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    p.grad.detach_()
                    # zero the gradient in place
                    p.grad.zero_()

    def add_param_group(self, param_group):
        param_set = set()
        for group in self.param_groups:
            param_set.update(set(group['params']))
        ...
        self.param_groups.append(param_group)

    def state_dict(self):
        ...
        return {
            'state': packed_state,
            'param_groups': param_groups,
        }

    def load_state_dict(self, state_dict):
        ...
  • zero_grad(): clears the gradients of all managed parameters
    A PyTorch characteristic: tensor gradients are not zeroed automatically, they accumulate; therefore the gradients must be zeroed manually after they have been used, or before the next backward pass
  • step(): performs a single parameter update
  • add_param_group(): adds a parameter group, e.g. to give the feature-extraction layers and the fully connected layers different learning rates or other hyperparameters
  • state_dict(): returns a dict with the optimizer's current state
    During long training runs, the current state is saved at regular intervals so that training can be resumed from a checkpoint instead of being lost when the run is interrupted unexpectedly
  • load_state_dict(): loads a state dict (these methods appear in a fixed order in a typical training loop, as shown in the sketch below)
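
In an ordinary training iteration the methods are called in a fixed order: clear the old gradients, run the backward pass, then take one step. A minimal sketch, where the model, loss, and data are placeholders:

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 1)                       # placeholder model
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

x, y = torch.randn(4, 10), torch.randn(4, 1)   # placeholder batch

for epoch in range(5):
    optimizer.zero_grad()             # clear gradients accumulated in the previous iteration
    loss = criterion(model(x), y)
    loss.backward()                   # compute gradients of the loss w.r.t. the parameters
    optimizer.step()                  # update the parameters using the current gradients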

IV. Method Examples

1. optimizer.step()
import torch
import random
import numpy as np
import torch.optim as optim

def set_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_seed(1)  # set the random seed

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1)


print("weight before step:{}".format(weight.data))
# take one SGD step: weight = weight - lr * weight.grad
optimizer.step()        
print("weight after step:{}".format(weight.data))
weight before step:tensor([[0.6614, 0.2669],
        [0.0617, 0.6213]])
weight after step:tensor([[ 0.5614,  0.1669],
        [-0.0383,  0.5213]])
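With lr = 0.1 and every gradient entry equal to 1, SGD updates each element as w − lr · g, e.g. 0.6614 − 0.1 × 1 = 0.5614, which matches the printed result.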
2. optimizer.zero_grad()
import torch
import random
import numpy as np
import torch.optim as optim

def set_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_seed(1)  # set the random seed

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1)


print("weight before step:{}".format(weight.data))
# take one SGD step
optimizer.step()       
print("weight after step:{}".format(weight.data))
# the optimizer and the local variable refer to the same tensor object (same id)
print("weight in optimizer:{}\nweight in weight:{}\n".format(id(optimizer.param_groups[0]['params'][0]), id(weight)))

print("weight.grad is {}\n".format(weight.grad))
# zero the gradients
optimizer.zero_grad()
print("after optimizer.zero_grad(), weight.grad is\n{}".format(weight.grad))
weight before step:tensor([[0.6614, 0.2669],
        [0.0617, 0.6213]])
weight after step:tensor([[ 0.5614,  0.1669],
        [-0.0383,  0.5213]])
weight in optimizer:2063731163904
weight in weight:2063731163904
weight.grad is tensor([[1., 1.],
        [1., 1.]])
after optimizer.zero_grad(), weight.grad is
tensor([[0., 0.],
        [0., 0.]])
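The identical id() values show that the optimizer stores references to the parameter tensors rather than copies, so param_groups adds almost no memory overhead, and zeroing the gradients through the optimizer is reflected in weight.grad.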
3. optimizer.add_param_group()
import torch
import random
import numpy as np
import torch.optim as optim

def set_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_seed(1)  # set the random seed

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1)

print("optimizer.param_groups is\n{}".format(optimizer.param_groups))

w2 = torch.randn((3, 3), requires_grad=True)
# add a parameter group so that different parameters use different learning rates
optimizer.add_param_group({"params": w2, 'lr': 0.0001})

print("optimizer.param_groups is\n{}".format(optimizer.param_groups))
optimizer.param_groups is
[{'params': [tensor([[0.6614, 0.2669],
        [0.0617, 0.6213]], requires_grad=True)], 'lr': 0.1, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}]
optimizer.param_groups is
[{'params': [tensor([[0.6614, 0.2669],
        [0.0617, 0.6213]], requires_grad=True)], 'lr': 0.1, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}, {'params': [tensor([[-0.4519, -0.1661, -1.5228],
        [ 0.3817, -1.0276, -0.5631],
        [-0.8923, -0.0583, -0.1955]], requires_grad=True)], 'lr': 0.0001, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}]
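In practice add_param_group() (or passing several groups to the constructor) is most often used to give different parts of a model different hyperparameters, as mentioned above for feature-extraction versus fully connected layers. A sketch under that assumption; the submodules and learning rates here are illustrative:

import torch.nn as nn
import torch.optim as optim

features = nn.Sequential(nn.Linear(10, 10), nn.ReLU())    # stands in for a feature extractor
classifier = nn.Linear(10, 2)                              # stands in for the fully connected head

optimizer = optim.SGD(classifier.parameters(), lr=0.1, momentum=0.9)
# fine-tune the feature extractor with a much smaller learning rate
optimizer.add_param_group({'params': features.parameters(), 'lr': 0.001})

for group in optimizer.param_groups:
    print(group['lr'], [p.shape for p in group['params']])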
4. optimizer.state_dict()
import os
import torch
import random
import numpy as np
import torch.optim as optim

def set_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_seed(1)  # set the random seed

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)

# state_dict() captures the optimizer's state; it is typically saved so training can resume from a checkpoint
opt_state_dict = optimizer.state_dict()

print("state_dict before step:\n", opt_state_dict)

for i in range(10):
    optimizer.step()
# get the optimizer's current state dict
print("state_dict after step:\n", optimizer.state_dict())
# save the state dict to disk
torch.save(optimizer.state_dict(), os.path.join('..', "optimizer_state_dict.pkl"))
state_dict before step:
 {'state': {}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [0]}]}
state_dict after step:
 {'state': {0: {'momentum_buffer': tensor([[6.5132, 6.5132],
        [6.5132, 6.5132]])}}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [0]}]}
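In the state dict, parameters are referenced by integer indices (here 0) rather than by the tensors themselves. The momentum_buffer value can be checked by hand: the gradient stays at 1 for all 10 steps (it is never recomputed or zeroed), so with momentum 0.9 the buffer equals 1 + 0.9 + 0.9² + … + 0.9⁹ = (1 − 0.9¹⁰) / 0.1 ≈ 6.5132.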
5. optimizer.load_state_dict()
import os
import torch
import random
import numpy as np
import torch.optim as optim

def set_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_seed(1)  # set the random seed

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))

optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)

# load the saved state dict from disk
state_dict = torch.load(os.path.join('..', "optimizer_state_dict.pkl"))

print("state_dict before load state:\n", optimizer.state_dict())
# load the state dict into the optimizer
optimizer.load_state_dict(state_dict)
print("state_dict after load state:\n", optimizer.state_dict())
state_dict before load state:
 {'state': {}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [0]}]}
state_dict after load state:
 {'state': {0: {'momentum_buffer': tensor([[6.5132, 6.5132],
        [6.5132, 6.5132]])}}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [0]}]}
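In a real training script, the optimizer state is usually saved and restored together with the model weights and the current epoch. A sketch of this common checkpointing pattern; the model, file name, and dict keys are illustrative:

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)                              # placeholder model
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
epoch = 5                                             # pretend we just finished epoch 5

# save model weights, optimizer state, and progress together
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch,
}
torch.save(checkpoint, 'checkpoint.pkl')

# ... later, resume training from the checkpoint
checkpoint = torch.load('checkpoint.pkl')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1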

V. Common Optimizer Parameters

1. learning rate

Gradient descent, without and with a learning rate:

$$w_{i+1} = w_i - g(w_i)$$
$$w_{i+1} = w_i - lr \cdot g(w_i)$$

The learning rate (lr) controls the step size of each update, and it needs to be chosen with care: a value that is too large can make the loss oscillate or diverge, while a value that is too small makes convergence slow.

import torch
import numpy as np
import matplotlib.pyplot as plt
torch.manual_seed(1)

def func(x_t):
    """
    y = (2x)^2 = 4*x^2      dy/dx = 8x
    """
    return torch.pow(2*x_t, 2)


# init
x = torch.tensor([2.], requires_grad=True)

lr = 0.01
max_iteration = 20
x_rec = list()    # record the trajectory of x

for i in range(max_iteration):

    y = func(x)
    y.backward()

    # x.detach().numpy(): x carries gradient information, so detach it before converting to numpy
    print("Iter:{}, X:{:8}, X.grad:{:8}, loss:{:10}".format(
        i, x.detach().numpy()[0], x.grad.detach().numpy()[0], y.item()))

    x_rec.append(x.item())

    # the update x = x - lr * x.grad, applied in place on x.data
    x.data.sub_(lr * x.grad)
    x.grad.zero_()
Iter:0, X:     2.0, X.grad:    16.0, loss:      16.0
Iter:1, X:1.840000033378601, X.grad:14.720000267028809, loss:13.542400360107422
Iter:2, X:1.6928000450134277, X.grad:13.542400360107422, loss:11.462287902832031
Iter:3, X:1.5573760271072388, X.grad:12.45900821685791, loss:9.701680183410645
Iter:4, X:1.432785987854004, X.grad:11.462287902832031, loss:8.211503028869629
Iter:5, X:1.3181631565093994, X.grad:10.545305252075195, loss:6.950216293334961
Iter:6, X:1.2127101421356201, X.grad:9.701681137084961, loss:5.882663726806641
Iter:7, X:1.1156933307647705, X.grad:8.925546646118164, loss:4.979086399078369
Iter:8, X:1.0264378786087036, X.grad:8.211503028869629, loss:4.214298725128174
Iter:9, X:0.9443228244781494, X.grad:7.554582595825195, loss:3.5669822692871094
Iter:10, X:0.8687769770622253, X.grad:6.950215816497803, loss:3.0190937519073486
Iter:11, X:0.7992748022079468, X.grad:6.394198417663574, loss:2.555360794067383
Iter:12, X:0.7353328466415405, X.grad:5.882662773132324, loss:2.1628575325012207
Iter:13, X:0.6765062212944031, X.grad:5.412049770355225, loss:1.8306427001953125
Iter:14, X:0.6223857402801514, X.grad:4.979085922241211, loss:1.549456000328064
Iter:15, X:0.5725948810577393, X.grad:4.580759048461914, loss:1.3114595413208008
Iter:16, X:0.526787281036377, X.grad:4.214298248291016, loss:1.110019326210022
Iter:17, X:0.4846442937850952, X.grad:3.8771543502807617, loss:0.9395203590393066
Iter:18, X:0.4458727538585663, X.grad:3.5669820308685303, loss:0.795210063457489
Iter:19, X:0.41020292043685913, X.grad:3.281623363494873, loss:0.673065721988678
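For this function the behaviour can be worked out in closed form: since dy/dx = 8x, one update gives x_{i+1} = x_i − lr · 8 · x_i = (1 − 8 · lr) · x_i. With lr = 0.01 each step multiplies x by 0.92 (2.0 → 1.84 → 1.6928, matching the log above); with lr = 0.125 the factor is 0 and the minimum is reached in a single step; and for lr > 0.25 the factor has magnitude greater than 1, so the iterates diverge.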
2. momentum

  Momentum combines the current gradient with information from previous updates to form the current update. In PyTorch the update rule is:

$$v_i = m \cdot v_{i-1} + g(w_i)$$
$$w_{i+1} = w_i - lr \cdot v_i$$

where $v_i$ is the update quantity, $m$ is the momentum coefficient (usually set to 0.9), and $g(w_i)$ is the gradient of $w_i$. Thus $v_i$ consists of two parts: it depends on both $v_{i-1}$ and $g(w_i)$, not only on the current gradient. Unrolling the recursion at step 100:

$$
\begin{aligned}
v_{100} &= m \cdot v_{99} + g(w_{100}) \\
        &= g(w_{100}) + m \cdot (m \cdot v_{98} + g(w_{99})) \\
        &= g(w_{100}) + m \cdot g(w_{99}) + m^2 \cdot v_{98} \\
        &= g(w_{100}) + m \cdot g(w_{99}) + m^2 \cdot g(w_{98}) + m^3 \cdot v_{97}
\end{aligned}
$$

The further back a gradient is, the smaller its contribution to the current update.
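For example, with m = 0.9 the gradient from 10 steps ago is weighted by 0.9¹⁰ ≈ 0.35 and the one from 30 steps ago by 0.9³⁰ ≈ 0.04, so the influence of old gradients decays geometrically.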

import torch
import numpy as np
import torch.optim as optim
import matplotlib.pyplot as plt
torch.manual_seed(1)

def func(x):
    return torch.pow(2*x, 2)    # y = (2x)^2 = 4*x^2        dy/dx = 8x

iteration = 100
m = 0.63

lr_list = [0.01, 0.03]

momentum_list = list()
loss_rec = [[] for l in range(len(lr_list))]
iter_rec = list()

for i, lr in enumerate(lr_list):
    x = torch.tensor([2.], requires_grad=True)

    # compare lr=0.03 without momentum against lr=0.01 with momentum m=0.63
    momentum = 0. if lr == 0.03 else m
    momentum_list.append(momentum)

    optimizer = optim.SGD([x], lr=lr, momentum=momentum)

    for iter in range(iteration):

        y = func(x)
        y.backward()

        optimizer.step()
        optimizer.zero_grad()

        loss_rec[i].append(y.item())

for i, loss_r in enumerate(loss_rec):
    plt.plot(range(len(loss_r)), loss_r, label="LR: {} M:{}".format(lr_list[i], momentum_list[i]))
plt.legend()
plt.xlabel('Iterations')
plt.ylabel('Loss value')
plt.show()

[Figure: loss curves of the two runs, comparing SGD with lr=0.01, momentum=0.63 against lr=0.03 without momentum]

VI. A Brief Overview of the Ten Optimizers in PyTorch

  1. optim.SGD: stochastic gradient descent

    optim.SGD(params, lr=<object object>, 
    		momentum=0, dampening=0, 
    		weight_decay=0, nesterov=False)
    

    Main parameters:

    • params: the parameter groups to manage
    • lr: initial learning rate
    • momentum: momentum coefficient
    • weight_decay: L2 regularization coefficient
    • nesterov: whether to use Nesterov accelerated gradient (NAG)
  2. optim.Adagrad: gradient descent with adaptive learning rates

  3. optim.RMSprop: an improvement on Adagrad

  4. optim.Adadelta: an improvement on Adagrad

  5. optim.Adam: RMSprop combined with momentum

  6. optim.Adamax: Adam with an upper bound on the learning rate

  7. optim.SparseAdam: a sparse version of Adam

  8. optim.ASGD: averaged stochastic gradient descent

  9. optim.Rprop: resilient backpropagation

  10. optim.LBFGS: an improvement on BFGS (limited-memory BFGS)

For how to use the optimizers above, refer to torch.optim.
SGD and Adam are the two most commonly used optimizers, and the official PyTorch documentation describes them in great detail.
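
As a quick reference, here is a minimal sketch constructing these two optimizers for a small placeholder model; the model and the hyperparameter values are illustrative, not a recommendation:

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)   # placeholder model

# SGD with momentum and L2 regularization (weight_decay)
sgd = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

# Adam with its commonly used default settings
adam = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999))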



