python机器学习实例
目录:KNN分类 —— K-近邻算法:当需要表示一个样本(值)的时候,用与该样本最接近的K个邻居来决定;K的取值会直接影响到最终的结果。
·
学习视频:
链接1: link.
聚类
球员聚类
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# NBA team clustering with K-Means.
data = pd.read_csv('nba.csv')

# Scale every feature column (all but the team name in column 0) into [0, 1].
minmax_scaler = MinMaxScaler()
X = minmax_scaler.fit_transform(data.iloc[:, 1:])

# Elbow method: plot inertia (within-cluster sum of squares) for k = 2..9
# and eyeball the "elbow" to pick a good cluster count.
loss = [KMeans(n_clusters=i).fit(X).inertia_ for i in range(2, 10)]
plt.plot(range(2, 10), loss)
plt.xlabel('k')
plt.ylabel('loss')
plt.show()

# k chosen from the elbow plot above.
k = 4
model = KMeans(n_clusters=k).fit(X)
data['Result_show'] = model.labels_  # cluster id per team, stored as a new column
data.head()

# Print the team names (column 0) that fall into each cluster.
for i in range(k):
    print('Result_show', i)
    result = data[data['Result_show'] == i].iloc[:, 0]
    print(result.values)
KNN
KNN分类
K-近邻算法:当需要表示一个样本(值)的时候,用与该样本最接近的K个邻居来决定
K的取值会直接影响到最终的结果
鸢尾花分类
from sklearn import neighbors
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Iris classification with k-nearest neighbours.
iris = datasets.load_iris()
# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2)
model = neighbors.KNeighborsClassifier(n_neighbors=3)  # n_neighbors is the "k" in KNN
model.fit(x_train, y_train)
prediction = model.predict(x_test)
# Precision / recall / F1 per class on the held-out set.
print(classification_report(y_test, prediction))
水果分类
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Fruit classification with KNN; sweep k to find the best value.
data = pd.read_csv('fruit_data.csv')
# print(data)

# Integer-encode the string class labels in column 0.
label_encoder = LabelEncoder()
data.iloc[:, 0] = label_encoder.fit_transform(data.iloc[:, 0])
print(label_encoder.classes_)  # original label corresponding to each encoded integer

x = data.iloc[:, 1:]
y = data.iloc[:, 0]
# random_state=20 makes the split reproducible across runs;
# stratify=y keeps the class proportions equal in train and test.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=20)

# Record test and train accuracy for every candidate k in 1..29.
test_score = []
train_score = []
k = 30
for i in range(1, k):
    knn = KNeighborsClassifier(i)
    knn.fit(x_train, y_train)
    test_score.append(knn.score(x_test, y_test))
    train_score.append(knn.score(x_train, y_train))

# Plot both accuracy curves against k.
plt.plot(range(1, k), test_score, label="Test")
plt.plot(range(1, k), train_score, label="Train")
plt.legend()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.xticks(range(1, k))  # one tick per candidate k
plt.show()

# Refit with the k that maximised test accuracy (+1 because the sweep started at 1).
k = np.argmax(test_score) + 1
knn = KNeighborsClassifier(k)
knn.fit(x_train, y_train)
print(k)
print(knn.score(x_test, y_test))
一元线性回归
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# One-variable linear regression: programmer salary (x) vs algorithm-engineer
# salary (y) for Beijing, Shanghai, Hangzhou, Shenzhen, Guangzhou.
x = [13854, 12213, 11009, 10655, 9503]
x = np.reshape(x, newshape=(5, 1)) / 10000.0  # column vector, scaled to 1e4 yuan
y = [21332, 20162, 19138, 18621, 18016]
y = np.reshape(y, newshape=(5, 1)) / 10000.0

# Fit the model and report R^2 on the training data.
lr = LinearRegression()
lr.fit(x, y)
print (lr.score(x, y))

# Predicted values along the fitted line.
y_hat = lr.predict(x)

# Scatter of the raw points plus the regression line.
plt.scatter(x, y)
plt.plot(x, y_hat)
plt.show()
回归算法
案例:波士顿房价预测
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — this needs scikit-learn < 1.2 to run; confirm the pinned version.
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
import seaborn as sns

# Boston house-price data: load it and explore feature correlations.
house = load_boston()
# print(house)        # raw dataset dict
# print(house.DESCR)  # dataset description text (DESCR key)
x = house.data
y = house.target

# Build a DataFrame with named feature columns plus the target.
df = pd.DataFrame(x, columns=house.feature_names)
df['Target'] = pd.DataFrame(y, columns=['Target'])
# print(df.head())

# Heatmap of pairwise correlation coefficients between all columns.
plt.figure(figsize=(15, 15))
p = sns.heatmap(df.corr(), annot=True, square=True)
plt.show()  # required, otherwise the window only flashes briefly
正数:正相关
(二维热力图:分析两个特征之间的相关性)
(数据标准化)
from sklearn.preprocessing import StandardScaler

# Standardise the features to zero mean and unit variance.
ss = StandardScaler()
x = ss.fit_transform(x)
(切分数据集以及该模型训练与评估)
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Lasso regression with built-in cross-validated alpha selection.
model = LassoCV()
model.fit(x_train, y_train)
score = model.score(x_test, y_test)  # R^2 on the test split; higher is better
print(score)
案例:葡萄酒质量和时间关系线性回归
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Wine quality vs age: one-variable linear regression.
data = np.genfromtxt('linear.csv', delimiter=',')  # CSV, hence the ',' delimiter
plt.scatter(data[1:, 0], data[1:, 1])  # data[rows, cols]; skip the header row
# plt.title('Age vs Quality')
# plt.xlabel('age')
# plt.ylabel('Quality')
# plt.show()

# Split: column 0 is the feature (age), column 1 the target (quality).
xTrain, xTest, yTrain, yTest = train_test_split(
    data[1:, 0], data[1:, 1], test_size=0.3)

# sklearn expects 2-D feature arrays: add a column axis (n rows x 1 column).
xTrain = xTrain[:, np.newaxis]
xTest = xTest[:, np.newaxis]

# Fit the model.
model = LinearRegression()
model.fit(xTrain, yTrain)

# Scatter train (yellow) and test (blue) points, then the fitted line in red.
plt.scatter(xTrain, yTrain, color='y')
plt.scatter(xTest, yTest, color='b')
plt.plot(xTest, model.predict(xTest), color='r', linewidth=5)
plt.title('Age vs Quality(Training set)')
plt.xlabel('age')
plt.ylabel('Quality')
plt.show()
逻辑回归
案例:用户流失预测
(将字符串的信息用数字编码替换成数字类型的字符串)
(将上步字符串类型的数字转换为可用于参与数学计算的数字类型)
(删掉不相关的数据以及提取需要的数据)
(回归模型检验)
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report  # model evaluation report

# Customer-churn prediction with logistic regression.
# Load everything as strings because the CSV mixes text and numeric columns.
# FIX: use the builtin str, not np.str — the np.str alias was deprecated in
# NumPy 1.20 and removed in 1.24, so dtype=np.str raises AttributeError there.
trainData = np.genfromtxt('Churn-Modelling.csv', delimiter=',', dtype=str)
testData = np.genfromtxt('Churn-Modelling-Test-Data.csv', delimiter=',', dtype=str)

# Skip the header row; the last column is the churn label, the rest features.
xTrain = trainData[1:, :-1]
yTrain = trainData[1:, -1]
xTest = testData[1:, :-1]
yTest = testData[1:, -1]

# Drop columns 0, 1, 2 (identifier columns unrelated to churn).
xTrain = np.delete(xTrain, [0, 1, 2], axis=1)
xTest = np.delete(xTest, [0, 1, 2], axis=1)

# Integer-encode the two categorical columns. Each encoder is fitted on the
# training data and reused on the test data so both share the same mapping.
labelEncoder1 = LabelEncoder()
xTrain[:, 1] = labelEncoder1.fit_transform(xTrain[:, 1])  # country names -> ints
xTest[:, 1] = labelEncoder1.transform(xTest[:, 1])
labelEncoder2 = LabelEncoder()
xTrain[:, 2] = labelEncoder2.fit_transform(xTrain[:, 2])
xTest[:, 2] = labelEncoder2.transform(xTest[:, 2])

# Every column is now a numeric string: convert to real floats.
xTrain = xTrain.astype(np.float32)
xTest = xTest.astype(np.float32)
yTrain = yTrain.astype(np.float32)
yTest = yTest.astype(np.float32)

# Standardise features: fit on train, apply the same transform to test.
sc = StandardScaler()
xTrain = sc.fit_transform(xTrain)
xTest = sc.transform(xTest)

# Fit the model and evaluate predictions against the test labels.
lr = LogisticRegression()
lr.fit(xTrain, yTrain)
prediction = lr.predict(xTest)
print(classification_report(yTest, prediction))
糖尿病预测项目
(查看数据信息,判断是否有空缺值)
(查看数据数目、平均值、最小值、标准差等)
(数据维数 几行几列)
(查看比如某列标签下 0有多少个 1有多少个,并用柱状图画出)
(可视化数据分布)
(处理异常值,比如年龄为0 葡萄糖为0的这些值 )
(链接: link.)
(删除空值较多的标签 填充空值较少的标签值)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno  # missing-value visualisation
from sklearn.impute import SimpleImputer  # imputation utilities
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Diabetes prediction: clean the data, then fit a logistic regression.
data = pd.read_csv('diabetes.csv')
data.info(verbose=True)  # dtypes and non-null counts — shows missing values
data.describe()          # count / mean / std / min / max per column
data.shape               # (rows, columns)

# Class balance of the Outcome target column, plus a bar chart of it.
print(data.Outcome.value_counts())
p = data.Outcome.value_counts().plot(kind="bar")
plt.show()

# Pairwise feature distributions, coloured by Outcome.
p = sns.pairplot(data, hue='Outcome')
plt.show()

# A 0 in these columns is an impossible measurement: treat it as missing.
column = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[column] = data[column].replace(0, np.nan)

# Bar chart of non-missing counts per column.
p = msno.bar(data)
plt.show()

# Drop any column missing more than 20% of its values...
threshData = data.shape[0] * 0.8  # a column must keep >= 80% non-null entries
data = data.dropna(thresh=threshData, axis=1)
p = msno.bar(data)
plt.show()

# ...and mean-impute the remaining gaps in the kept columns.
imr = SimpleImputer(missing_values=np.nan, strategy="mean")
column = ['Glucose', 'BloodPressure', 'BMI']
data[column] = imr.fit_transform(data[column])
p = msno.bar(data)
plt.show()

# Correlation heatmap of the cleaned data.
plt.figure(figsize=(12, 10))
p = sns.heatmap(data.corr(), annot=True)
plt.show()

# Features = everything but Outcome; stratify keeps the class ratio in both splits.
x = data.drop("Outcome", axis=1)
y = data.Outcome
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y)

# Fit and evaluate the model.
lr = LogisticRegression()
lr.fit(x_train, y_train)
prediction = lr.predict(x_test)
print(classification_report(y_test, prediction))
神经网络-手写数字识别
如果找不到神经网络的包,就要更新sklearn
from sklearn.neural_network import MLPClassifier  # on import errors, upgrade sklearn
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix

# Handwritten-digit recognition with a small multilayer perceptron.
digits = load_digits()
x_data = digits.data
y_data = digits.target

# Standardise the pixel features before training.
scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3)

# Two hidden layers (100 and 50 neurons); train for up to 500 iterations.
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500)
mlp.fit(x_train, y_train)
prediction = mlp.predict(x_test)
print(classification_report(y_test, prediction))
决策树
叶子分类
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Leaf-species classification: decision tree plus grid-searched hyperparameters.
train = pd.read_csv('train.csv')
print(train.head())
print(len(train.species.unique()))  # number of distinct species classes

# Integer-encode the species names.
lb = LabelEncoder().fit(train.species)
labels = lb.transform(train.species)

# Features are everything except the label and the row id.
data = train.drop(['species', 'id'], axis=1)
print(data.head())
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.3, stratify=labels)

# Baseline tree with default hyperparameters.
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)
print(tree.score(x_test, y_test))

# If the test score is too low, tune hyperparameters with 3-fold cross-validation.
param_grid = {'max_depth': [30, 40, 50, 60, 70],
              'min_samples_split': [2, 3, 4, 5, 6],
              'min_samples_leaf': [1, 2, 3, 4]}
model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=3)
model.fit(x_train, y_train)
print(model.best_estimator_)
print(model.score(x_test, y_test))
更多推荐
已为社区贡献1条内容
所有评论(0)