机器学习算法之鸢尾花数据SVM分类

import numpy as npimport pandas as pdimport matplotlib as mplimport matplotlib.pyplot as pltimport warningsfrom sklearn import svm #svm导入from sklearn.model_selection import train_test_splitfrom sklear

Mr Robot

4363人浏览 · 2021-07-16 16:09:06

Mr Robot · 2021-07-16 16:09:06 发布

编译器：「你有个错误。」
女程序员：「不可能！」
编译器：「你听我解释。」
女程序员：「我不听我不听我不听！」
编译器：「……」
女程序员：「你是不是不爱我了？你肯定和别的程序员好上了！」
在这里插入图片描述

import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm  # support vector machine estimators
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

try:
    from sklearn.exceptions import ChangedBehaviorWarning
except ImportError:
    # Recent scikit-learn releases no longer export this warning class.
    # Without this guard the whole script fails at import time; with it,
    # the warning filter below is simply skipped.
    ChangedBehaviorWarning = None

## Matplotlib settings so that Chinese labels are not rendered as garbage
# SimHei is a CJK-capable font; it must be installed on the machine.
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Draw the minus sign as ASCII '-' so it is available in the chosen font.
mpl.rcParams['axes.unicode_minus'] = False

# Silence ChangedBehaviorWarning only where scikit-learn still defines it.
if ChangedBehaviorWarning is not None:
    warnings.filterwarnings('ignore', category=ChangedBehaviorWarning)

## Load the data
# Column order in the file:
# 'sepal length', 'sepal width', 'petal length', 'petal width', label
iris_feature = '花萼长度', '花萼宽度', '花瓣长度', '花瓣宽度'
path = './datas/iris.data'  # location of the data file
data = pd.read_csv(path, header=None)  # the file has no header row
# Columns 0-3 are the features, column 4 is the class label.
x, y = data[list(range(4))], data[4]
# Encode the text labels as integer codes (e.g. a, b, c -> 0, 1, 2).
y = pd.Categorical(y).codes
# Keep only the first two features so the result can be drawn in 2-D.
x = x[[0, 1]]

svm.SVC API说明：

功能：使用SVM分类器进行模型构建

参数说明：

C: 误差项的惩罚系数，默认为1.0；一般为大于0的一个数字，C越大表示在训练过程中对于总误差的关注度越高，也就是说当C越大的时候，对于训练集的表现会越好，但是有可能引发过度拟合的问题(overfitting)

kernel：指定SVM内部函数的类型，可选值：linear、poly、rbf、sigmoid、precomputed(基本不用，有前提要求，要求特征属性数目和样本数目一样)；默认是rbf；

degree：当使用多项式函数(poly)作为svm内部的函数的时候，给定多项式的次数，默认为3

gamma：当SVM内部使用poly、rbf、sigmoid的时候，核函数的系数值；当取值为auto的时候，实际系数为1/n_features（较新版本的scikit-learn默认值为scale，此时实际系数为1/(n_features * X.var())）

coef0: 当核函数为poly或者sigmoid的时候，给定的独立系数，默认为0

probability：是否启用概率估计，默认不启动，不太建议启动

shrinking：是否开启收缩启发式计算，默认为True

tol: 模型构建收敛参数，当模型的误差变化率小于该值的时候，结束模型构建过程，默认值:1e-3

cache_size：在模型构建过程中，缓存数据的最大内存大小，默认为200，单位MB

class_weight：给定各个类别的权重，默认为空

max_iter：最大迭代次数，默认-1表示不限制

decision_function_shape: 决策函数，可选值：ovo和ovr，默认为None（较新版本默认为ovr）；推荐使用ovr；（0.17以上版本才有）

‘’’

## Build the SVM classifier (RBF kernel)
# A larger gamma fits the training set more closely but tends to overfit,
# which hurts accuracy on the test set.
# A smaller gamma generalises better (train and test scores stay close),
# but may underfit, lowering accuracy on both sets.
clf = svm.SVC(kernel='rbf', C=1, gamma=0.1)

## Split the data: 80% for training, the rest for testing (fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=0,
)

## Train the model on the training split
clf.fit(x_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)

## Report model accuracy on both splits
# Each accuracy is printed twice on purpose: once via the estimator's own
# score() and once via accuracy_score() on explicit predictions.
train_pred = clf.predict(x_train)
test_pred = clf.predict(x_test)

print(clf.score(x_train, y_train))
print('训练集准确率：', accuracy_score(y_train, train_pred))
print(clf.score(x_test, y_test))
print('测试集准确率：', accuracy_score(y_test, test_pred))

## Show the raw decision-function values next to the predicted classes
# (per the original author: decision_function gives each sample's distance
# to the separating hyperplanes, i.e. the value of the decision function)
train_scores = clf.decision_function(x_train)
print('decision_function:\n', train_scores)
print('\npredict:\n', train_pred)

0.85
训练集准确率： 0.85
0.7333333333333333
测试集准确率： 0.7333333333333333
decision_function:
[[-1.2115785 -0.96426875 -0.44139404]
[-0.0212214 0.32350899 0.99112326]
[ 1.00005673 1.01718054 1.05204952]
[-0.99994331 -0.66528842 0.02120321]
[-1.60266762 -1.19125662 -0.34300204]
[-0.16521486 0.35408006 1.31104281]
[ 0.69279181 0.57417729 0.31720605]
[-1.28313944 -0.76470258 0.26329009]
[-0.73890579 -0.39147577 0.31044165]
[-1.15777559 -0.56660164 0.57617025]
[-1.69871586 -1.45938381 -0.94256326]
[ 1.25806731 1.39535824 1.63982539]
[-1.64728822 -1.47365095 -1.09249207]
[ 1.2765006 1.27246874 1.26435909]
[ 0.98788955 0.90604414 0.73689976]
[-0.78691903 0.09648912 1.5804225 ]
[-1.44630029 -1.15797742 -0.54881233]
[-1.23934761 -1.13345014 -0.90129927]
[-1.86259623 -1.07788983 0.4081823 ]
[-2.17515331 -1.67200749 -0.64112176]
[-1.01772962 -0.36197254 0.87193767]
[-1.97619342 -1.91014185 -1.7613738 ]
[-0.82090465 -0.673922 -0.36140678]
[-1.00753075 -0.29489718 1.0209052 ]
[-1.62495298 -1.14370562 -0.16736106]
[-1.50022637 -1.07383779 -0.20111733]
[-0.01312801 0.59035511 1.6416271 ]
[-1.85741788 -1.27006502 -0.09971918]
[-1.59045698 -1.47962414 -1.2320166 ]
[-1.44630029 -1.15797742 -0.54881233]
[-0.29306176 -0.18680416 0.03534241]
[ 0.98731335 1.20276475 1.59327218]
[-1.0244391 -0.4918598 0.5471533 ]
[-0.74760311 -0.21008981 0.8205251 ]
[-0.46201608 -0.04764132 0.76184941]
[-0.87800513 -0.22229661 1.00000019]
[-0.99994331 -0.66528842 0.02120321]
[-1.69335567 -1.61484419 -1.43613804]
[ 1.12917983 1.03954337 0.85648678]
[ 1.44876802 1.6340948 1.92482686]
[-1.72419296 -1.25915183 -0.3087661 ]
[-0.45981896 -0.10741381 0.59328626]
[ 0.56943472 0.66322619 0.85224295]
[ 1.38039434 0.99961444 0.12663756]
[-1.64728822 -1.47365095 -1.09249207]
[ 1.24380025 1.42606916 1.74194508]
[-0.73890579 -0.39147577 0.31044165]
[-0.30086627 0.32948786 1.45572233]
[ 1.6459113 1.64657025 1.6384621 ]
[-2.06488029 -1.31235155 0.13535878]
[-1.22523666 -1.30742315 -1.49858872]
[-0.60530889 -0.19100546 0.62530803]
[ 0.84367734 1.12217663 1.62297473]
[-1.8714472 -1.68858016 -1.28520261]
[-1.5256875 -1.39925107 -1.11922193]
[-1.44630029 -1.15797742 -0.54881233]
[-1.60266762 -1.19125662 -0.34300204]
[ 1.52714388 1.36460329 1.03547425]
[ 1.40408568 1.43422731 1.48755559]
[-1.18103576 -0.99958452 -0.61027816]
[-1.23934761 -1.13345014 -0.90129927]
[ 0.25929317 0.89679742 1.91970846]
[-0.55593927 -0.45758096 -0.24948117]
[ 0.70630023 0.99998501 1.53652905]
[-1.15398059 -0.50072946 0.73959317]
[-1.64728822 -1.47365095 -1.09249207]
[ 1.60772083 1.69326378 1.82520211]
[ 1.40843183 1.34597872 1.22292699]
[-1.910839 -1.74764677 -1.38525956]
[ 1.14005949 1.14715935 1.16163088]
[ 1.34076509 1.54647062 1.88226565]
[ 1.24332398 1.0551231 0.65786632]
[-1.13124841 -0.36393017 1.03934418]
[-1.45214967 -1.3106943 -0.99999991]
[-1.97989533 -1.89510888 -1.70355504]
[ 1.14005949 1.14715935 1.16163088]
[ 0.85148441 1.08173824 1.50621779]
[ 0.85969884 0.93730958 1.09101573]
[-0.74827888 -0.27258407 0.65612649]
[-1.53893835 -1.2658354 -0.6845997 ]
[ 0.85678378 1.03711259 1.37768925]
[ 0.85952851 0.98879 1.23876831]
[-1.47623331 -1.11920105 -0.37665021]
[ 0.56943472 0.66322619 0.85224295]
[-2.27348422 -2.07130609 -1.63579055]
[-0.14997309 0.51145788 1.65249668]
[-1.23156876 -1.42481069 -1.90290001]
[-1.49820868 -1.29210591 -0.84651752]
[ 1.73328128 1.43317509 0.79982627]
[-0.87204973 -0.53006191 0.16619215]
[ 0.80890971 0.59580447 0.12126086]
[-2.13575164 -2.00125643 -1.70520822]
[ 1.52714388 1.36460329 1.03547425]
[ 1.25806731 1.39535824 1.63982539]
[-2.14630444 -1.94433764 -1.50115158]
[ 0.99988684 1.11758107 1.34196277]
[-0.67943552 -0.58707901 -0.39044646]
[-0.74827888 -0.27258407 0.65612649]
[-1.0244391 -0.4918598 0.5471533 ]
[-1.02339162 -0.4278724 0.71343596]
[-1.07489736 -0.88228938 -0.47210779]
[-1.62317236 -1.3664538 -0.81609555]
[-1.85741788 -1.27006502 -0.09971918]
[-1.62317236 -1.3664538 -0.81609555]
[ 0.56512685 0.87249231 1.44195107]
[-1.00753075 -0.29489718 1.0209052 ]
[-1.49820868 -1.29210591 -0.84651752]
[-1.0244391 -0.4918598 0.5471533 ]
[ 1.00153058 1.06901707 1.20104727]
[-1.66331804 -1.33817531 -0.65210779]
[-0.31545099 0.03641806 0.7302115 ]
[-0.44805403 -0.22142795 0.24408854]
[-2.06305008 -1.37643799 -0.03319841]
[ 0.56688125 0.60657329 0.68778312]
[ 1.40359182 1.29537905 1.07861418]
[ 0.85148441 1.08173824 1.50621779]
[-1.37076721 -1.00001505 -0.23456702]
[-1.0244391 -0.4918598 0.5471533 ]
[-1.19909031 -1.38321314 -1.83255529]
[ 1.38122138 1.50202626 1.71040112]]

predict:
[2 1 0 1 2 1 0 1 1 1 2 0 2 0 0 1 2 2 1 2 1 2 2 1 2 2 1 2 2 2 1 0 1 1 1 1 1
2 0 0 2 1 0 0 2 0 1 1 0 1 2 1 0 2 2 2 2 0 0 2 2 0 2 0 1 2 0 0 2 0 0 0 1 2
2 0 0 0 1 2 0 0 2 0 2 1 2 2 0 1 0 2 0 0 2 0 2 1 1 1 2 2 2 2 0 1 2 1 0 2 1
1 2 0 0 0 2 1 2 0]

# Plot the decision regions of the trained classifier
N = 500  # number of grid samples along each axis
# x has exactly two feature columns, so min()/max() each yield two values.
x1_min, x2_min = x.min()
x1_max, x2_max = x.max()

t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, N)
x1, x2 = np.meshgrid(t1, t2)  # grid of sampling points
# One row per grid point, columns = (feature 0, feature 1).
grid_show = np.column_stack((x1.ravel(), x2.ravel()))

grid_hat = clf.predict(grid_show)      # predicted class for every grid point
grid_hat = grid_hat.reshape(x1.shape)  # back to the grid's 2-D shape

cm_light = mpl.colors.ListedColormap(['#00FFCC', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
plt.figure(facecolor='w')
## Background: predicted class regions
plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
## All samples, coloured by true class
plt.scatter(x[0], x[1], c=y, edgecolors='k', s=50, cmap=cm_dark)
## Circle the test-set samples
# An explicit edge colour is required: with facecolors='none' and the default
# edge colour ('face' in Matplotlib >= 2.0) the rings would not be drawn.
plt.scatter(x_test[0], x_test[1], s=120, facecolors='none',
            edgecolors='k', zorder=10)
## Axis labels, limits and title
plt.xlabel(iris_feature[0], fontsize=13)
plt.ylabel(iris_feature[1], fontsize=13)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.title('鸢尾花SVM特征分类', fontsize=16)
# Pass the on/off flag positionally: the keyword was called `b` in older
# Matplotlib and `visible` in newer releases, which reject `b=`.
plt.grid(True, ls=':')
plt.tight_layout(pad=1.5)
plt.show()