逻辑回归实现鸢尾花分类

from sklearn.datasets import load_iris
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
iris=load_iris()
#print(iris)
print(iris['target_names'])#分类名称
data=…

是忘生啊

5883人浏览 · 2022-01-17 00:15:00

是忘生啊 · 2022-01-17 00:15:00 发布

from sklearn.datasets import load_iris
import numpy as  np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# Load the Iris data set that ships with scikit-learn.
iris = load_iris()
# Feature matrix (one row per sample) and the integer class label of each row.
data, target = iris.data, iris.target
print(iris['target_names'])  # names of the classes
print(data.shape)
print(target.shape)

['setosa' 'versicolor' 'virginica']
(150, 4)
(150,)

1.使用前两个特征进行分类

对于多分类问题来说，由于逻辑回归一次只能对两个类别进行分类，所以如果有n个类别需要分类，则需要为每个类别重新制作y数据集（以类别1为例，将类别1的样本标记为1，其他类别都标记为0）。最终用n次分类得到的n组参数分别与x[i]进行运算，哪个概率最大，预测出来的y就是哪个类别。

# Only the first two features are used here so the samples can be drawn in 2-D.
x = data[:, 0:2]
y = target
# One pair of coordinate lists per class; any label other than 0 or 1 is
# collected in the third pair.
a0, b0 = [], []
a1, b1 = [], []
a2, b2 = [], []
groups = {0: (a0, b0), 1: (a1, b1)}
for (first, second), label in zip(x, y):
    xs, ys = groups.get(label, (a2, b2))
    xs.append(first)
    ys.append(second)
# Draw each class with its own colour and marker.
scatter1 = plt.scatter(a0, b0, c='b', marker='o')
scatter2 = plt.scatter(a1, b1, c='r', marker='x')
scatter3 = plt.scatter(a2, b2, c='y', marker='s')
plt.legend(
    handles=[scatter1, scatter2, scatter3],
    labels=['setosa', 'versicolor', 'virginica'],
    loc='best',
)
plt.show()

在这里插入图片描述

def sigmoid(x):
    """Return the logistic function ``1 / (1 + e^(-x))`` of ``x``.

    ``x`` is negated and handed to ``np.exp``, so scalars and NumPy
    arrays/matrices are processed element-wise.
    """
    decay = np.exp(-x)
    return 1.0 / (1 + decay)
def cost(x,y,theta):
    """Return the mean binary cross-entropy of a logistic-regression model.

    Parameters
    ----------
    x : array-like
        Sample matrix, one row per sample; converted with ``np.matrix``.
    y : array-like
        0/1 labels; converted with ``np.matrix``. Callers in this file pass
        a list of one-element lists, i.e. an (m, 1) column.
    theta : array-like
        Column of weights with one entry per column of ``x``.

    Returns
    -------
    The negated sum of ``y*log(h) + (1-y)*log(1-h)`` divided by the number
    of rows of ``x``, where ``h = sigmoid(x*theta)``.
    """
    x = np.matrix(x)
    y = np.matrix(y)
    theta = np.matrix(theta)
    z = x * theta
    # log(sigmoid(z)) == -log(1 + e^-z) and log(1 - sigmoid(z)) == -log(1 + e^z).
    # np.logaddexp evaluates both without overflow, so a saturated sigmoid no
    # longer yields log(0) = -inf and the resulting 0 * -inf = nan.
    log_h = -np.logaddexp(0, -z)
    log_not_h = -np.logaddexp(0, z)
    first = np.multiply(y, log_h)
    second = np.multiply(1 - y, log_not_h)
    return np.sum(first + second) / (-x.shape[0])
def grad(x,y,theta,epochs=1000,lr=0.001):
    """Fit logistic-regression weights by batch gradient descent.

    Parameters
    ----------
    x, y, theta : array-like
        Sample matrix, label column and initial weight column; each is
        converted with ``np.matrix``.
    epochs : int
        The update loop runs ``epochs + 1`` times (indices 0..epochs).
    lr : float
        Step size applied to the averaged gradient.

    Returns
    -------
    (theta, costList)
        The final weight matrix and the list of costs recorded after the
        update on every 50th iteration (including iteration 0).
    """
    features = np.matrix(x)
    labels = np.matrix(y)
    weights = np.matrix(theta)
    sample_count = features.shape[0]
    history = []
    step = 0
    while step <= epochs:
        # Difference between predicted probability and label for every sample.
        residual = sigmoid(features * weights) - labels
        # Averaged gradient of the cross-entropy, scaled by the learning rate.
        weights = weights - lr * (features.T * residual / sample_count)
        if step % 50 == 0:
            history.append(cost(features, labels, weights))
        step += 1
    return weights, history

1.对setosa分类，分出setosa和其他种类

# Prepend a column of ones so that theta[0] acts as the intercept term.
x = np.hstack((np.ones((len(x), 1)), x))
# One-vs-rest labels for setosa: [1] where the class is 0, otherwise [0].
y1 = [[1] if label == 0 else [0] for label in y]
# Start from all-ones weights; 3000 epochs at learning rate 0.6.
theta, costList = grad(x, y1, np.ones((x.shape[1], 1)), 3000, 0.6)
# grad records the cost every 50 iterations: 61 values for 3000 epochs.
a = np.linspace(0, 3000, 61)
plt.plot(a, costList, c='y')
plt.show()

png

theta

matrix([[ 5.0040943 ],
        [-6.76191622],
        [10.16162326]])

# Class-coloured scatter of the two plotted features (columns 1 and 2 of x;
# column 0 is the column of ones added for the intercept).
plt.scatter(x[:,1],x[:,2],c=y)
# Two x-axis values at which the boundary line is evaluated.
m=[[4.5],[8.0]]
# Solve theta[0] + theta[1]*x1 + theta[2]*x2 == 0 for x2 at those x1 values.
n=(-theta[0]-m*theta[1])/theta[2]
plt.plot(m,n,c='r')
plt.show()

png

可以看出，仅使用两个特征，就可以很好地将 setosa 与其他两类（versicolor、virginica）区分开。

2.对versicolor分类

# Start from all-ones weights, one per column of x.
theta2 = np.ones((x.shape[1], 1))
# One-vs-rest labels for versicolor: [1] where the class is 1, otherwise [0].
y2 = [[1] if label == 1 else [0] for label in y]
theta2, costList = grad(x, y2, theta2, 8000, 0.28)
# The x-axis must span the same 8000 epochs passed to grad; taking the point
# count from costList keeps the axis and the recorded costs in step.
a = np.linspace(0, 8000, len(costList))
plt.plot(a, costList, c='y')
plt.show()

png

theta2

matrix([[ 8.80218283],
        [ 0.25495515],
        [-3.45489597]])

# Class-coloured scatter of the two plotted features (columns 1 and 2 of x).
plt.scatter(x[:,1],x[:,2],c=y)
# Two x-axis values at which each boundary line is evaluated.
m=[[4.5],[8.0]]
# For each parameter vector t, solve t[0] + t[1]*x1 + t[2]*x2 == 0 for x2.
n=(-theta[0]-m*theta[1])/theta[2]
l=(-theta2[0]-m*theta2[1])/theta2[2]
plt.plot(m,n,c='b')  # line from theta (blue)
plt.plot(m,l,c='r')  # line from theta2 (red)
plt.show()

png

3.对virginica分类

# One-vs-rest labels for virginica: [1] where the class is 2, otherwise [0].
y3 = [[1] if label == 2 else [0] for label in y]
# Start from all-ones weights; 6000 epochs at learning rate 0.1.
theta3, costList = grad(x, y3, np.ones((x.shape[1], 1)), 6000, 0.1)
# grad records the cost every 50 iterations: 121 values for 6000 epochs.
a = np.linspace(0, 6000, 121)
plt.plot(a, costList, c='y')
plt.show()

![png](output_14_0.png)


theta3

matrix([[-6.42102857],
        [ 1.79488608],
        [-1.6847187 ]])

# Class-coloured scatter of the two plotted features (columns 1 and 2 of x).
plt.scatter(x[:,1],x[:,2],c=y)
# Two x-axis values at which each boundary line is evaluated.
m=[[4.5],[8.0]]
# For each parameter vector t, solve t[0] + t[1]*x1 + t[2]*x2 == 0 for x2.
n=(-theta[0]-m*theta[1])/theta[2]
l=(-theta2[0]-m*theta2[1])/theta2[2]
q=(-theta3[0]-m*theta3[1])/theta3[2]
plt.plot(m,n,c='b')  # line from theta (blue)
plt.plot(m,l,c='r')  # line from theta2 (red)
plt.plot(m,q,c='y')  # line from theta3 (yellow)
plt.show()

png

从setosa（0）、versicolor（1）、virginica（2）来看，0分类和其他两个分类可以用一条直线很好地分开；但是对1和2分类来说，一条直线显然无法将其分隔开，所以需要我们考虑更多的样本特征。

2. 使用全部4个特征来评估模型

# Full design matrix: a leading column of ones followed by every feature column.
X = np.hstack((np.ones((len(data), 1)), data))
Y = target
print(X.shape, Y.shape)

(150, 5) (150,)

# One-vs-rest labels for class 0: [1] where the class is 0, otherwise [0].
Y1 = [[1] if label == 0 else [0] for label in Y]
# Start from all-ones weights; 3000 epochs at learning rate 0.6.
Theta1, costList = grad(X, Y1, np.ones((X.shape[1], 1)), 3000, 0.6)
# grad records the cost every 50 iterations: 61 values for 3000 epochs.
a = np.linspace(0, 3000, 61)
plt.plot(a, costList, c='y')
plt.show()
Theta1

png

matrix([[ 1.33220841],
        [ 0.52729505],
        [ 3.3411161 ],
        [-4.85587593],
        [-1.63431095]])

# Start from all-ones weights, one per column of X.
Theta2 = np.ones((X.shape[1], 1))
# One-vs-rest labels for class 1: [1] where the class is 1, otherwise [0].
Y2 = [[1] if label == 1 else [0] for label in Y]
Theta2, costList = grad(X, Y2, Theta2, 8000, 0.1)
# The x-axis must span the same 8000 epochs passed to grad; taking the point
# count from costList keeps the axis and the recorded costs in step.
a = np.linspace(0, 8000, len(costList))
plt.plot(a, costList, c='y')
plt.show()
Theta2

png

matrix([[ 4.75032157],
        [ 0.11228482],
        [-2.45581587],
        [ 1.16623831],
        [-2.70084823]])

# One-vs-rest labels for class 2: [1] where the class is 2, otherwise [0].
Y3 = [[1] if label == 2 else [0] for label in Y]
# Start from all-ones weights; 3000 epochs at learning rate 0.6.
Theta3, costList = grad(X, Y3, np.ones((X.shape[1], 1)), 3000, 0.6)
# grad records the cost every 50 iterations: 61 values for 3000 epochs.
a = np.linspace(0, 3000, 61)
plt.plot(a, costList, c='y')
plt.show()
Theta3

png

matrix([[-5.40077764],
        [-4.5160182 ],
        [-4.88791515],
        [ 6.77915362],
        [ 8.57906837]])

# One-vs-rest probability of each class for every sample.
predict_y1 = sigmoid(X * Theta1)
predict_y2 = sigmoid(X * Theta2)
predict_y3 = sigmoid(X * Theta3)
# Put the three probability columns side by side and pick, per row, the class
# with the largest value; ties resolve to the lowest class index.
scores = np.asarray(np.hstack((predict_y1, predict_y2, predict_y3)))
predict_y = [[int(best)] for best in np.argmax(scores, axis=1)]
# Show the three probabilities next to the chosen class for every sample.
for p1, p2, p3, chosen in zip(predict_y1, predict_y2, predict_y3, predict_y):
    print(p1, ' ', p2, ' ', p3, chosen)

print(classification_report(Y,predict_y))# overall accuracy (0.97 in the recorded run)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.96      0.96      0.96        50
           2       0.96      0.96      0.96        50

    accuracy                           0.97       150
   macro avg       0.97      0.97      0.97       150
weighted avg       0.97      0.97      0.97       150

3.用sklearn进行逻辑回归

from sklearn.linear_model import LogisticRegression

# Fit scikit-learn's LogisticRegression with its default settings on all
# features, then predict on the same samples it was trained on.
model = LogisticRegression().fit(data, target)
prediction = model.predict(data)
print(model.intercept_)
print(model.coef_)

[  9.84186228   2.21913963 -12.06100191]
[[-0.41943756  0.96749376 -2.5205723  -1.084326  ]
 [ 0.53147635 -0.3150198  -0.20094963 -0.94785159]
 [-0.11203879 -0.65247397  2.72152193  2.03217759]]

print(classification_report(target,prediction))# accuracy 0.97

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.98      0.94      0.96        50
           2       0.94      0.98      0.96        50

    accuracy                           0.97       150
   macro avg       0.97      0.97      0.97       150
weighted avg       0.97      0.97      0.97       150