Softmax Regression

Concept and Principle

  • softmax operator
    turns the raw outputs into a probability distribution (guaranteeing non-negativity and normalization to 1)
  • cross-entropy loss
    measures the difference between two probability distributions; here the softmax output and the one-hot encoded label are treated as the two distributions (a small numeric sketch follows this list)
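
For concreteness, here is a minimal numeric sketch of both ideas; the logit values are made up for illustration:

import torch

# raw scores (logits) for one sample over 3 classes
logits=torch.tensor([2.0,1.0,0.1])
probs=torch.exp(logits)/torch.exp(logits).sum()
print(probs)         # tensor([0.6590, 0.2424, 0.0986]) -- all non-negative
print(probs.sum())   # tensor(1.) -- sums to one

# against the one-hot label for class 0, cross-entropy reduces to -log(probs[0]):
# the zero entries of the one-hot vector kill every other term of the sum
print(-torch.log(probs[0]))   # tensor(0.4170)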

Implementation

  • Dataset
import torch
import torchvision
from torch.utils import data
from torchvision import transforms

def LoadData(batch_size,resize=None):
    # ToTensor converts PIL images to tensors and scales pixels to [0, 1]
    trans=[transforms.ToTensor()]
    if resize:
        trans.insert(0,transforms.Resize(resize))
    trans=transforms.Compose(trans)

    train_data=torchvision.datasets.FashionMNIST(
        root="./dataset",train=True,
        transform=trans,download=True
    )
    test_data=torchvision.datasets.FashionMNIST(
        root="./dataset",train=False,
        transform=trans,download=True
    )

    return (data.DataLoader(train_data,batch_size=batch_size,shuffle=True),
            data.DataLoader(test_data,batch_size=batch_size,shuffle=False))

train_iter,test_iter=LoadData(256)
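
To sanity-check the loaders, a quick sketch (assuming the snippet above has already run) that inspects one batch:

X,y=next(iter(train_iter))
print(X.shape)   # torch.Size([256, 1, 28, 28]) -- 256 grayscale 28x28 images
print(y.shape)   # torch.Size([256]) -- integer class labels in [0, 9]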
  • From-scratch implementation
# flatten each 1x28x28 image into a vector
num_inputs=1*28*28
# 10 classes in total
num_outputs=10

w=torch.normal(0,0.01,size=(num_inputs,num_outputs),requires_grad=True)
b=torch.zeros(num_outputs,requires_grad=True)

def Softmax(X):
    # numerators: exp of every logit, one row per sample in the batch
    # (for very large logits torch.exp can overflow; subtracting
    # X.max(1,keepdim=True).values first is the standard fix)
    X_exp=torch.exp(X)
    # denominators: one row sum per sample
    partition=X_exp.sum(1,keepdim=True)
    # broadcasting divides each row by its own sum
    return X_exp/partition

def CrossEntropy(y_hat,y):
    # only the entry indexed by y contributes: the terms where the
    # one-hot label is 0 are simply never computed.
    # For every sample in the batch, y_hat is indexed by the corresponding
    # label in y, giving the summed loss over all samples in the batch.
    return -torch.log(y_hat[range(len(y_hat)),y]).sum()

def Net(w,b,X):
    return Softmax(torch.matmul(X,w)+b)

def Sgd(params,lr):
    # in-place update outside the autograd graph
    with torch.no_grad():
        for i in params:
            i-=lr*i.grad
            i.grad.zero_()

def Train():
    for epoch in range(50):
        loss=0
        for X,y in train_iter:
            X=X.view(-1,784)
            out=Net(w,b,X)
            l=CrossEntropy(out,y)
            l.backward()
            # the learning rate is this small because neither the loss
            # nor the optimizer divides by batch_size, so the gradients are large
            Sgd([w,b],0.0001)
            # keep the loss of the most recent batch for logging
            loss=l.item()
        print(f"{epoch},{loss}")

Train()
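
The loop above only logs the training loss of the last batch per epoch. To see how well the model generalizes, test-set accuracy can be computed; the helper below is my own addition, not part of the original code:

def Accuracy(data_iter):
    # fraction of samples whose highest-probability class matches the label
    correct,total=0,0
    with torch.no_grad():
        for X,y in data_iter:
            out=Net(w,b,X.view(-1,784))
            correct+=(out.argmax(dim=1)==y).sum().item()
            total+=y.numel()
    return correct/total

print(Accuracy(test_iter))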
  • Concise implementation
from torch import nn,optim

train_iter,test_iter=LoadData(256)

# flatten each 1x28x28 image into a vector
num_inputs=1*28*28
# 10 classes in total
num_outputs=10

# Flatten() keeps dimension 0 (the batch) and flattens everything else
net=nn.Sequential(nn.Flatten(),nn.Linear(num_inputs,num_outputs))
# CrossEntropyLoss applies log-softmax internally, so the net outputs raw logits
loss_f=nn.CrossEntropyLoss()
opt=optim.Adam(net.parameters())

for epoch in range(50):
    loss=0
    for X,y in train_iter:
        out=net(X)
        l=loss_f(out,y)
        l.backward()
        opt.step()
        # clear gradients so the next batch starts fresh
        opt.zero_grad()
        # keep the loss of the most recent batch for logging
        loss=l.item()
    print(f"{epoch},{loss}")
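
One point worth stressing: nn.CrossEntropyLoss expects raw logits and applies log-softmax internally, which is why the network above ends with a plain Linear layer and no explicit Softmax (adding one would silently hurt training). A small check with made-up values:

import torch
from torch import nn

logits=torch.tensor([[2.0,1.0,0.1]])
y=torch.tensor([0])

# CrossEntropyLoss == log-softmax followed by negative log-likelihood
a=nn.CrossEntropyLoss()(logits,y)
b=-torch.log_softmax(logits,dim=1)[0,y.item()]
print(a.item(),b.item())   # both about 0.4170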