from sklearn.datasets import load_iris
import numpy as  np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
# Load the iris dataset that ships with scikit-learn.
iris = load_iris()
print(iris.target_names)  # names of the classes

# Feature matrix (150 samples, 4 features each) and the class label per sample.
data, target = iris.data, iris.target
print(data.shape)
print(target.shape)
['setosa' 'versicolor' 'virginica']
(150, 4)
(150,)

1.使用前两个特征进行分类

对于多分类问题来说,由于逻辑回归一次只能对两个类别进行分类,所以如果有n个类别需要分类,则需要为每个类别重新制作y数据集(以类别1为例,将属于类别1的样本标记为1,其他类别的样本都标记为0)。这样训练n次得到n组参数,预测时分别用每组参数与x[i]计算概率,哪个类别的概率最大,预测出来的y就是哪个类别。

# Only the first two features are used here so the samples can be drawn in 2-D.
x = data[:, :2]
y = target

# Split both feature columns by class label. Any label other than 0 or 1
# goes into the third group, exactly like a trailing `else` branch.
a0 = [row[0] for row, label in zip(x, y) if label == 0]
b0 = [row[1] for row, label in zip(x, y) if label == 0]
a1 = [row[0] for row, label in zip(x, y) if label == 1]
b1 = [row[1] for row, label in zip(x, y) if label == 1]
a2 = [row[0] for row, label in zip(x, y) if label != 0 and label != 1]
b2 = [row[1] for row, label in zip(x, y) if label != 0 and label != 1]

# One scatter series per class, each with its own colour and marker.
scatter1 = plt.scatter(a0, b0, c='b', marker='o')
scatter2 = plt.scatter(a1, b1, c='r', marker='x')
scatter3 = plt.scatter(a2, b2, c='y', marker='s')
plt.legend(
    handles=[scatter1, scatter2, scatter3],
    labels=['setosa', 'versicolor', 'virginica'],
    loc='best',
)
plt.show()

在这里插入图片描述

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)), applied element-wise."""
    decay = np.exp(-x)
    return 1.0 / (1 + decay)
def cost(x,y,theta):
    """Return the mean binary cross-entropy of a logistic-regression model.

    Args:
        x: design matrix with one row per sample (anything np.matrix accepts).
        y: 0/1 labels, either as a column with one row per sample or as a
            flat sequence, which is turned into a column.
        theta: weights, one row per column of x.

    Returns:
        -sum(y*log(h) + (1-y)*log(1-h)) / m, where h = sigmoid(x*theta) and
        m is the number of rows of x.
    """
    x = np.matrix(x)
    y = np.matrix(y)
    theta = np.matrix(theta)
    m = x.shape[0]
    # np.matrix turns a flat label sequence into a single row; flip it so the
    # element-wise products below do not broadcast into an m-by-m matrix.
    if y.shape[0] != m and y.shape[1] == m:
        y = y.T
    h = sigmoid(x * theta)
    # A saturated prediction (exactly 0.0 or 1.0) would produce 0 * log(0),
    # which is nan; keep probabilities strictly inside (0, 1).
    eps = 1e-15
    h = np.clip(h, eps, 1 - eps)
    first = np.multiply(y, np.log(h))
    second = np.multiply(1 - y, np.log(1 - h))
    return np.sum(first + second) / (-m)
def grad(x,y,theta,epochs=1000,lr=0.001):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: design matrix with one row per sample (anything np.matrix accepts).
        y: 0/1 labels, either as a column with one row per sample or as a
            flat sequence, which is turned into a column.
        theta: initial weights, one row per column of x.
        epochs: the loop runs epochs + 1 update steps.
        lr: learning rate multiplying the gradient at every step.

    Returns:
        A tuple (theta, costList): the final weights as an np.matrix, and the
        cost measured right after the update on every 50th iteration
        (iteration 0 included).
    """
    x = np.matrix(x)
    y = np.matrix(y)
    theta = np.matrix(theta)
    m = x.shape[0]
    # np.matrix turns a flat label sequence into a single row; flip it so that
    # h - y stays a column instead of broadcasting into an m-by-m matrix.
    if y.shape[0] != m and y.shape[1] == m:
        y = y.T
    costList = []
    # epochs + 1 iterations, so that iteration `epochs` itself is also
    # recorded when epochs is a multiple of 50.
    for i in range(epochs + 1):
        h = sigmoid(x * theta)
        # Gradient of the mean cross-entropy with respect to theta.
        delta = x.T * (h - y) / m
        theta = theta - lr * delta
        if i % 50 == 0:
            costList.append(cost(x, y, theta))
    return theta, costList
1.对setosa分类,分出setosa和其他种类
# Prepend a column of ones so the first weight acts as the intercept.
x = np.concatenate([np.ones((len(x), 1)), x], axis=1)
theta = np.ones((x.shape[1], 1))

# One-vs-rest labels: 1 for class 0 (setosa), 0 for everything else.
y1 = [[1] if label == 0 else [0] for label in y]
theta, costList = grad(x, y1, theta, epochs=3000, lr=0.6)

# grad records the cost every 50 iterations, so 3000 epochs give 61 points.
a = np.linspace(0, 3000, 61)
plt.plot(a, costList, color='y')
plt.show()

png

theta
matrix([[ 5.0040943 ],
        [-6.76191622],
        [10.16162326]])
# Scatter the two features coloured by class label.
plt.scatter(x[:, 1], x[:, 2], c=y)

# Decision boundary theta[0] + theta[1]*x1 + theta[2]*x2 = 0, solved for x2
# at the two x1 end points in m.
m = [[4.5], [8.0]]
n = -(theta[0] + m*theta[1]) / theta[2]
plt.plot(m, n, color='r')
plt.show()

png

可以看出,仅用两个特征,就可以用一条直线很好地把setosa与另外两类(versicolor、virginica)分开。

2.对versicolor分类
# One-vs-rest classifier for versicolor on the two-feature design matrix x:
# label 1 becomes the positive class, every other label becomes 0.
theta2 = np.ones((x.shape[1], 1))
y2 = [[1] if label == 1 else [0] for label in y]
theta2, costList = grad(x, y2, theta2, 8000, 0.28)

# The cost is recorded every 50 iterations over 8000 epochs (161 points), so
# the x-axis must run from 0 to 8000; sizing it from costList keeps the two
# in step.
a = np.linspace(0, 8000, len(costList))
plt.plot(a, costList, c='y')
plt.show()

png

theta2
matrix([[ 8.80218283],
        [ 0.25495515],
        [-3.45489597]])
# Scatter the two features coloured by class label.
plt.scatter(x[:, 1], x[:, 2], c=y)

# Each boundary is t[0] + t[1]*x1 + t[2]*x2 = 0 solved for x2 at the two x1
# end points in m: setosa model in blue, versicolor model in red.
m = [[4.5], [8.0]]
n = -(theta[0] + m*theta[1]) / theta[2]
l = -(theta2[0] + m*theta2[1]) / theta2[2]
plt.plot(m, n, color='b')
plt.plot(m, l, color='r')
plt.show()

png

3.对virginica分类
# One-vs-rest classifier for virginica on the two-feature design matrix x:
# label 2 becomes the positive class, every other label becomes 0.
theta3 = np.ones((x.shape[1], 1))
y3 = [[1] if label == 2 else [0] for label in y]
theta3, costList = grad(x, y3, theta3, epochs=6000, lr=0.1)

# grad records the cost every 50 iterations, so 6000 epochs give 121 points.
a = np.linspace(0, 6000, 121)
plt.plot(a, costList, color='y')
plt.show()

![png](output_14_0.png)


theta3
matrix([[-6.42102857],
        [ 1.79488608],
        [-1.6847187 ]])
# Scatter the two features coloured by class label.
plt.scatter(x[:, 1], x[:, 2], c=y)

# Each boundary is t[0] + t[1]*x1 + t[2]*x2 = 0 solved for x2 at the two x1
# end points in m: setosa in blue, versicolor in red, virginica in yellow.
m = [[4.5], [8.0]]
n = -(theta[0] + m*theta[1]) / theta[2]
l = -(theta2[0] + m*theta2[1]) / theta2[2]
q = -(theta3[0] + m*theta3[1]) / theta3[2]
plt.plot(m, n, color='b')
plt.plot(m, l, color='r')
plt.plot(m, q, color='y')
plt.show()


png

从setosa(0)、versicolor(1)、virginica(2)来看,0分类和其他两个分类可以用一条直线很好地分开,但是对1和2分类来说,一条直线显然无法将其分隔开,所以需要我们考虑更多的样本特征。

2. 使用全部4个特征来评估模型

# Design matrix for the four-feature models: a leading column of ones
# (intercept term) followed by all columns of data.
X = np.concatenate([np.ones((len(data), 1)), data], axis=1)
Y = target
print(X.shape, Y.shape)
(150, 5) (150,)
# One-vs-rest model for setosa on the design matrix X: label 0 becomes the
# positive class, every other label becomes 0.
Theta1 = np.ones((X.shape[1], 1))
Y1 = [[1] if label == 0 else [0] for label in Y]
Theta1, costList = grad(X, Y1, Theta1, epochs=3000, lr=0.6)

# grad records the cost every 50 iterations, so 3000 epochs give 61 points.
a = np.linspace(0, 3000, 61)
plt.plot(a, costList, color='y')
plt.show()
Theta1

png

matrix([[ 1.33220841],
        [ 0.52729505],
        [ 3.3411161 ],
        [-4.85587593],
        [-1.63431095]])
# One-vs-rest model for versicolor on the design matrix X: label 1 becomes
# the positive class, every other label becomes 0.
Theta2 = np.ones((X.shape[1], 1))
Y2 = [[1] if label == 1 else [0] for label in Y]
Theta2, costList = grad(X, Y2, Theta2, 8000, 0.1)

# The cost is recorded every 50 iterations over 8000 epochs (161 points), so
# the x-axis must run from 0 to 8000; sizing it from costList keeps the two
# in step.
a = np.linspace(0, 8000, len(costList))
plt.plot(a, costList, c='y')
plt.show()
Theta2

png

matrix([[ 4.75032157],
        [ 0.11228482],
        [-2.45581587],
        [ 1.16623831],
        [-2.70084823]])
# One-vs-rest model for virginica on the design matrix X: label 2 becomes
# the positive class, every other label becomes 0.
Theta3 = np.ones((X.shape[1], 1))
Y3 = [[1] if label == 2 else [0] for label in Y]
Theta3, costList = grad(X, Y3, Theta3, epochs=3000, lr=0.6)

# grad records the cost every 50 iterations, so 3000 epochs give 61 points.
a = np.linspace(0, 3000, 61)
plt.plot(a, costList, color='y')
plt.show()
Theta3

png

matrix([[-5.40077764],
        [-4.5160182 ],
        [-4.88791515],
        [ 6.77915362],
        [ 8.57906837]])
# Score every sample with each of the three one-vs-rest models.
predict_y1 = sigmoid(X*Theta1)
predict_y2 = sigmoid(X*Theta2)
predict_y3 = sigmoid(X*Theta3)

# The predicted class is the model with the highest score; when scores tie,
# the lower class label wins because it is checked first.
predict_y = []
for p1, p2, p3 in zip(predict_y1, predict_y2, predict_y3):
    top = max(p1, p2, p3)
    if top == p1:
        predict_y.append([0])
    elif top == p2:
        predict_y.append([1])
    else:
        predict_y.append([2])

# Dump the three scores next to the chosen label for every sample.
for p1, p2, p3, chosen in zip(predict_y1, predict_y2, predict_y3, predict_y):
    print(p1, ' ', p2, ' ', p3, chosen)

# Per-class precision / recall / f1 on the same data the models were fit on.
print(classification_report(Y, predict_y))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.96      0.96      0.96        50
           2       0.96      0.96      0.96        50

    accuracy                           0.97       150
   macro avg       0.97      0.97      0.97       150
weighted avg       0.97      0.97      0.97       150

3.用sklearn进行逻辑回归

from sklearn.linear_model import LogisticRegression

# Fit scikit-learn's logistic regression on all four features and predict on
# the same samples; fit() returns the estimator itself.
model = LogisticRegression().fit(data, target)
prediction = model.predict(data)

# Learned intercepts and coefficient rows.
print(model.intercept_)
print(model.coef_)
[  9.84186228   2.21913963 -12.06100191]
[[-0.41943756  0.96749376 -2.5205723  -1.084326  ]
 [ 0.53147635 -0.3150198  -0.20094963 -0.94785159]
 [-0.11203879 -0.65247397  2.72152193  2.03217759]]
print(classification_report(target,prediction))# accuracy 0.97
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.98      0.94      0.96        50
           2       0.94      0.98      0.96        50

    accuracy                           0.97       150
   macro avg       0.97      0.97      0.97       150
weighted avg       0.97      0.97      0.97       150

Logo

华为开发者空间,是为全球开发者打造的专属开发空间,汇聚了华为优质开发资源及工具,致力于让每一位开发者拥有一台云主机,基于华为根生态开发、创新。

更多推荐