Problem Background

The task is to predict whether a customer will subscribe to a bank product; a training set and a test set are provided. The competition is:
Tianchi [Learning Competition] Financial Data Analysis Problem 1: Bank Customer Product Subscription Prediction (天池【教学赛】金融数据分析赛题1:银行客户认购产品预测)
1. Data Preprocessing

  • The feature columns come as strings and floats, so during preprocessing they need to be converted into integer codes.
  • The dataset contains both discrete and continuous features. Unordered discrete features are one-hot encoded, while ordered discrete features are usually label (ordinal) encoded; a minimal sketch of both encodings follows this list.
  • Several discrete columns contain 'unknown' or 'nonexistent'; when I encode these columns numerically, 'unknown' is assigned the middle (average) score.
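As a minimal sketch of the two encoding styles (using pandas and scikit-learn; the toy columns below just reuse category values from this dataset and are not part of the competition pipeline):

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# toy frame with one unordered and one ordered categorical column
df = pd.DataFrame({'marital': ['single', 'married', 'divorced'],
                   'education': ['basic.4y', 'high.school', 'university.degree']})

# unordered discrete feature -> one-hot: one 0/1 column per category
one_hot = pd.get_dummies(df['marital'], prefix='marital')

# ordered discrete feature -> label/ordinal encoding: one integer column that preserves the order
levels = [['basic.4y', 'high.school', 'university.degree']]
df['education_level'] = OrdinalEncoder(categories=levels).fit_transform(df[['education']]).ravel()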

1) A background score is derived from the customer's job and education

def background_combination(job,education):

    job_dic={'admin.':4,'blue-collar':3,'entrepreneur':6,
             'housemaid':3,'management':5,'retired':3,
             'self-employed':4,'services':2,'student':1,
             'technician':3,'unemployed':1,'unknown':3}

    education_dic={'basic.4y':1,'basic.6y':1,'basic.9y':2,
                   'high.school':3,'illiterate':0,'professional.course':4,
                   'university.degree':4,'unknown':2}
    job_score=job_dic[job]
    education_score=education_dic[education]
    background_score=job_score+education_score
    return background_score

2) An economic-level score is derived from the customer's default, housing, and loan status

def economic_level(default,housing,loan):
    default_dic={'yes':1,'no':3,'unknown':2}
    housing_dic={'yes':3,'no':1,'unknown':2}
    loan_dic={'yes':1,'no':3,'unknown':2}
    economic_score=default_dic[default]+housing_dic[housing]+loan_dic[loan]
    return economic_score
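A quick check of the two scoring functions above; the expected values follow directly from the dictionaries:

print(background_combination('admin.', 'university.degree'))   # 4 + 4 = 8
print(economic_level('no', 'yes', 'no'))                        # 3 + 3 + 3 = 9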

3) marital, contact, and poutcome are then one-hot encoded

def one_hot_data_preparation(value):
    marital_dic={'divorced':1,'single':2,'married':3,'unknown':2}
    contact_dic={'cellular':1,'telephone':2}
    poutcome_dic={'failure':2,'nonexistent':1,'success':3}
    if value in marital_dic:
        return marital_dic[value]
    if value in contact_dic:
        return contact_dic[value]
    if value in poutcome_dic:
        return poutcome_dic[value]
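# A quick check of the shared mapping above (expected values come straight from the dicts):
#   one_hot_data_preparation('married')      -> 3
#   one_hot_data_preparation('telephone')    -> 2
#   one_hot_data_preparation('nonexistent')  -> 1
# The three dicts are searched in order and only marital_dic defines 'unknown', so this
# relies on the contact and poutcome columns never containing 'unknown' in this dataset.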

# Assemble the encoded columns into a DataFrame; marital, contact and poutcome are
# given integer codes here and one-hot encoded later

def data_transfer_process(dataset):
    background_score_list=[]
    economic_score_list=[]
    marital_list=[]
    contact_list=[]
    poutcome_list=[]
    for i in dataset.values:
        background_score=background_combination(i[0],i[1])
        economic_score=economic_level(i[2],i[3],i[4])
        marital_value=one_hot_data_preparation(i[5])
        contact_value=one_hot_data_preparation(i[6])
        poutcome_value=one_hot_data_preparation(i[7])

        background_score_list.append(background_score)
        economic_score_list.append(economic_score)
        marital_list.append(marital_value)
        contact_list.append(contact_value)
        poutcome_list.append(poutcome_value)

    rows=[background_score_list,economic_score_list,marital_list,contact_list,poutcome_list]
    rows=transpose(rows)
    name=['background','economic_level','marital','contact','poutcome']
    data=pd.DataFrame(columns=name,data=rows)
    return data
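# Example of what data_transfer_process produces for one (illustrative) row:
#   job='admin.', education='university.degree', default='no', housing='yes', loan='no',
#   marital='married', contact='cellular', poutcome='nonexistent'
#   -> background=8, economic_level=9, marital=3, contact=1, poutcome=1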



def train_data_preparation():

    # load the training data and split off the label column
    train_data = pd.DataFrame(pd.read_csv('./train.csv'))
    label=train_data[['subscribe']]

    label_list=[]
    train_data=train_data[['job','education',
                           'default','housing','loan',
                           'marital','contact','poutcome']]

    processed_train_data=data_transfer_process(train_data)

    for i in label.values:
        label_value=judge(i[0])
        label_list.append(label_value)
    label_name=['label']
    label_train=pd.DataFrame(columns=label_name,data=label_list)
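    # Note: scikit-learn >= 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`;
    # on newer versions, use OneHotEncoder(sparse_output=False) in the calls below.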

    one_hot_marital=OneHotEncoder(sparse=False).fit_transform(processed_train_data[['marital']])
    train_marital_list=['marital_1','marital_2','marital_3']
    one_hot_marital=pd.DataFrame(one_hot_marital).astype(int)
    one_hot_marital.columns=train_marital_list


    one_hot_contact=OneHotEncoder(sparse=False).fit_transform(processed_train_data[['contact']])
    train_contact_list=['contact_1','contact_2']
    one_hot_contact=pd.DataFrame(one_hot_contact).astype(int)
    one_hot_contact.columns=train_contact_list

    one_hot_poutcome=OneHotEncoder(sparse=False).fit_transform(processed_train_data[['poutcome']])
    train_poutcome_list=['poutcome_1','poutcome_2','poutcome_3']
    one_hot_poutcome=pd.DataFrame(one_hot_poutcome).astype(int)
    one_hot_poutcome.columns=train_poutcome_list


    # The label-encoded marital/contact/poutcome columns are kept alongside their
    # one-hot versions; the column slicing in train.py relies on this layout.

    frames=[processed_train_data,one_hot_marital,one_hot_contact,one_hot_poutcome,label_train]
    train_result=pd.concat(frames,axis=1)

    return train_result
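# Column layout of train_result after the concat above (14 columns):
#   0: background, 1: economic_level, 2: marital, 3: contact, 4: poutcome (label-encoded),
#   5-7: marital_1..3, 8-9: contact_1..2, 10-12: poutcome_1..3, 13: label.
# train.py (section 3) slices columns 1:13 as the features and column 13 as the label.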

2. Implementing the MLP Model

  • MLP.py
import numpy as np


class MLP:
    '''
    MLP for multi-class classification
    '''
    def predict(self, feature, parameter_dict):
        feature = np.mat(feature)
        feature = np.mat(self.normalize(feature))
        re_list = []
        sample_num = feature.shape[0]
        for m in range(sample_num):
            current_sample = feature[m]
            for layer_index in range(len(parameter_dict.keys())):
                current_sample = np.insert(current_sample, 0, values=1, axis=1)
                # print(current_sample)
                # print("===================")
                # print(parameter_dict[layer_index + 1])
                current_sample = current_sample * parameter_dict[layer_index + 1]
                current_sample = self.sigmoid(current_sample)
            # print(current_sample)
            re_list.append(np.argmax(np.array(current_sample)))
        print("*****************")
        return re_list

    def train(self, feature, label, hidden, learning_rate, iteration_num):
        '''
        :param feature: m x n feature matrix (m samples, n features)
        :param label: 1 x m row matrix of integer class labels
        :param hidden: dict describing the hidden layers as {layer index: number of neurons},
                       with layer indices starting at 1, e.g. {1: 5} is one hidden layer of 5 neurons
        :param learning_rate: learning rate for gradient descent
        :param iteration_num: number of gradient-descent iterations (passes over all samples)
        :return: parameter_dict: the weight matrices between consecutive layers
        '''

        feature = np.mat(feature)
        feature = np.mat(self.normalize(feature))
        label = np.mat(label)

        # initialize the weight matrices
        feature_num = feature.shape[1]
        hidden_layer_num = len(hidden.keys())
        label_set = set()
        for i in np.array(label)[0]:
            label_set.add(i)
        label_categories_num = len(label_set)
        parameter_dict = {}
        # weights from the input layer to the first hidden layer
        parameter_dict[1] = np.mat(np.random.rand(feature_num + 1, int(hidden[1])))
        # weights between consecutive hidden layers
        if hidden_layer_num > 1:
            for layer_index in range(1, hidden_layer_num):
                parameter_dict[layer_index + 1] = np.mat(np.random.rand(hidden[layer_index] + 1, hidden[layer_index + 1]))
        # weights from the last hidden layer to the output layer
        parameter_dict[hidden_layer_num + 1] = np.mat(np.random.rand(hidden[hidden_layer_num] + 1, label_categories_num))

        # build the one-hot label matrix
        sample_num = feature.shape[0]
        label_matrix = np.mat(np.zeros((sample_num, label_categories_num)))
        for m in range(sample_num):
            label_matrix[m, label[0, m]] = 1

        # run gradient descent and return the trained weight matrices
        parameter_dict = self.gradient_descent(feature, label_matrix, parameter_dict, learning_rate, iteration_num)
        return parameter_dict

    # gradient descent: update the weight matrices with per-sample back-propagation
    def gradient_descent(self, feature, label, parameter_dict, learning_rate, iteration_num):
        for _ in range(iteration_num):
            sample_num = feature.shape[0]
            parameter_num = len(parameter_dict.keys())
            # back-propagation, sample by sample (stochastic updates)
            for m in range(sample_num):
                current_sample = feature[m]
                current_label = label[m]
                forward_input_value = {0: current_sample}
                activation_value = {0: current_sample}
                deviation = {}
                # forward pass: compute each layer's pre-activation input and activation output
                for layer_index_fp, parameter in parameter_dict.items():
                    activation_value[layer_index_fp - 1] = np.insert(activation_value[layer_index_fp - 1], 0, values=1, axis=1)  # add the bias term
                    forward_input_value[layer_index_fp] = activation_value[layer_index_fp - 1] * parameter_dict[layer_index_fp]
                    activation_value[layer_index_fp] = self.sigmoid(forward_input_value[layer_index_fp])
                # backward pass: compute the error terms
                deviation[parameter_num] = activation_value[parameter_num] - current_label  # output-layer error under the cross-entropy loss
                for layer_index_bp in range(parameter_num - 1, 0, -1):
                    # add the bias term to the forward input
                    forward_input_value[layer_index_bp] = np.insert(forward_input_value[layer_index_bp], 0, values=1, axis=1)
                    # hidden-layer error: (delta_(l+1) * W_(l+1)^T) multiplied element-wise by sigmoid'(z_l)
                    ones = np.mat(np.ones((1, forward_input_value[layer_index_bp].shape[1])))
                    sigmoid_value = self.sigmoid(forward_input_value[layer_index_bp])
                    deviation[layer_index_bp] = np.multiply(deviation[layer_index_bp + 1] * parameter_dict[layer_index_bp + 1].T,
                                                            np.multiply(sigmoid_value, ones - sigmoid_value))
                    # drop the bias component from the error
                    deviation[layer_index_bp] = np.delete(deviation[layer_index_bp], 0, axis=1)
                # update the weights
                for parameter_index in range(parameter_num, 0, -1):
                    parameter_dict[parameter_index] -= learning_rate * activation_value[parameter_index - 1].T * deviation[parameter_index]
        return parameter_dict

    # sigmoid activation
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    # standardize each feature to zero mean and unit variance
    def normalize(self, feature):
        feature_normalized = np.copy(feature).astype(float)
        feature_mean = np.mean(feature, 0)
        feature_deviation = np.std(feature, 0)
        if feature.shape[0] > 1:
            feature_normalized -= feature_mean
        feature_deviation[feature_deviation == 0] = 1
        feature_normalized /= feature_deviation
        return feature_normalized
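A minimal usage sketch of the class above on a toy two-class problem; the toy data, hidden-layer size and hyper-parameters are illustrative only (hidden={1: 4} means a single hidden layer with 4 neurons):

import numpy as np
from MLP import MLP

# toy 2-feature, 2-class data; the label must be a 1 x m row, as MLP.train expects
feature = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]] * 10)
label = np.array([[0, 1, 1, 0] * 10])

mlp = MLP()
parameter_dict = mlp.train(feature=feature, label=label, hidden={1: 4},
                           learning_rate=0.1, iteration_num=200)
predictions = mlp.predict(feature, parameter_dict)   # list of predicted class indices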


3. Model Training and the Final Complete Code

  • data_preparation.py
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder



'''
1. Discrete vs. continuous features:
   unordered discrete features are one-hot encoded; ordered discrete features are label (ordinal) encoded
2. Handling of 'unknown': the background and economic scores assign it the middle (average) value
'''

def judge(value):
    # map the label to a class index: 'yes' -> 0, anything else ('no') -> 1
    if value=='yes':
        result=0
    else:
        result=1
    return result

def transpose(matrix):
    new_matrix = []
    for i in range(len(matrix[0])):
        matrix1 = []
        for j in range(len(matrix)):
            matrix1.append(matrix[j][i])
        new_matrix.append(matrix1)
    return new_matrix
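# transpose() above is equivalent to list(map(list, zip(*matrix))); it is kept as an explicit double loop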

# drop rows whose key columns are 'unknown' / 'nonexistent' (defined here but not used in the final pipeline)
def drop_bias_data(dataset):
    dataset=dataset[~(dataset['job'].isin(['unknown'])|
                      dataset['marital'].isin(['unknown'])|
                      dataset['education'].isin(['unknown'])|
                      dataset['default'].isin(['unknown'])|
                      dataset['housing'].isin(['unknown'])|
                      dataset['loan'].isin(['unknown'])|
                      dataset['poutcome'].isin(['nonexistent']))]

    return dataset

def background_combination(job,education):

    job_dic={'admin.':4,'blue-collar':3,'entrepreneur':6,
             'housemaid':3,'management':5,'retired':3,
             'self-employed':4,'services':2,'student':1,
             'technician':3,'unemployed':1,'unknown':3}

    education_dic={'basic.4y':1,'basic.6y':1,'basic.9y':2,
                   'high.school':3,'illiterate':0,'professional.course':4,
                   'university.degree':4,'unknown':2}
    job_score=job_dic[job]
    education_score=education_dic[education]
    background_score=job_score+education_score
    return background_score

def economic_level(default,housing,loan):
    default_dic={'yes':1,'no':3,'unknown':2}
    housing_dic={'yes':3,'no':1,'unknown':2}
    loan_dic={'yes':1,'no':3,'unknown':2}
    economic_score=default_dic[default]+housing_dic[housing]+loan_dic[loan]
    return economic_score

def one_hot_data_preparation(value):
    marital_dic={'divorced':1,'single':2,'married':3,'unknown':2}
    contact_dic={'cellular':1,'telephone':2}
    poutcome_dic={'failure':2,'nonexistent':1,'success':3}
    if value in marital_dic:
        return marital_dic[value]
    if value in contact_dic:
        return contact_dic[value]
    if value in poutcome_dic:
        return poutcome_dic[value]

# Assemble the encoded columns into a DataFrame; marital, contact and poutcome are
# given integer codes here and one-hot encoded later

def data_transfer_process(dataset):
    background_score_list=[]
    economic_score_list=[]
    marital_list=[]
    contact_list=[]
    poutcome_list=[]
    for i in dataset.values:
        background_score=background_combination(i[0],i[1])
        economic_score=economic_level(i[2],i[3],i[4])
        marital_value=one_hot_data_preparation(i[5])
        contact_value=one_hot_data_preparation(i[6])
        poutcome_value=one_hot_data_preparation(i[7])

        background_score_list.append(background_score)
        economic_score_list.append(economic_score)
        marital_list.append(marital_value)
        contact_list.append(contact_value)
        poutcome_list.append(poutcome_value)

    rows=[background_score_list,economic_score_list,marital_list,contact_list,poutcome_list]
    rows=transpose(rows)
    name=['background','economic_level','marital','contact','poutcome']
    data=pd.DataFrame(columns=name,data=rows)
    return data



def train_data_preparation():

    # load the training data and split off the label column
    train_data = pd.DataFrame(pd.read_csv('./train.csv'))
    label=train_data[['subscribe']]

    label_list=[]
    train_data=train_data[['job','education',
                           'default','housing','loan',
                           'marital','contact','poutcome']]

    processed_train_data=data_transfer_process(train_data)

    for i in label.values:
        label_value=judge(i[0])
        label_list.append(label_value)
    label_name=['label']
    label_train=pd.DataFrame(columns=label_name,data=label_list)
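    # Note: scikit-learn >= 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`;
    # on newer versions, use OneHotEncoder(sparse_output=False) in the calls below.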

    one_hot_marital=OneHotEncoder(sparse=False).fit_transform(processed_train_data[['marital']])
    train_marital_list=['marital_1','marital_2','marital_3']
    one_hot_marital=pd.DataFrame(one_hot_marital).astype(int)
    one_hot_marital.columns=train_marital_list


    one_hot_contact=OneHotEncoder(sparse=False).fit_transform(processed_train_data[['contact']])
    train_contact_list=['contact_1','contact_2']
    one_hot_contact=pd.DataFrame(one_hot_contact).astype(int)
    one_hot_contact.columns=train_contact_list

    one_hot_poutcome=OneHotEncoder(sparse=False).fit_transform(processed_train_data[['poutcome']])
    train_poutcome_list=['poutcome_1','poutcome_2','poutcome_3']
    one_hot_poutcome=pd.DataFrame(one_hot_poutcome).astype(int)
    one_hot_poutcome.columns=train_poutcome_list


    # The label-encoded marital/contact/poutcome columns are kept alongside their
    # one-hot versions; the column slicing in train.py relies on this layout.

    frames=[processed_train_data,one_hot_marital,one_hot_contact,one_hot_poutcome,label_train]
    train_result=pd.concat(frames,axis=1)

    return train_result

def test_data_preparation():

    # load the test data
    test_data = pd.DataFrame(pd.read_csv('./test.csv'))
    test_data=test_data[['job','education',
                         'default','housing','loan',
                         'marital','contact','poutcome']]
    processed_test_data=data_transfer_process(test_data)

    one_hot_marital=OneHotEncoder(sparse=False).fit_transform(processed_test_data[['marital']])
    train_marital_list=['marital_1','marital_2','marital_3']
    one_hot_marital=pd.DataFrame(one_hot_marital).astype(int)
    one_hot_marital.columns=train_marital_list

    one_hot_contact=OneHotEncoder(sparse=False).fit_transform(processed_test_data[['contact']])
    train_contact_list=['contact_1','contact_2']
    one_hot_contact=pd.DataFrame(one_hot_contact).astype(int)
    one_hot_contact.columns=train_contact_list

    one_hot_poutcome=OneHotEncoder(sparse=False).fit_transform(processed_test_data[['poutcome']])
    train_poutcome_list=['poutcome_1','poutcome_2','poutcome_3']
    one_hot_poutcome=pd.DataFrame(one_hot_poutcome).astype(int)
    one_hot_poutcome.columns=train_poutcome_list


    # The label-encoded marital/contact/poutcome columns are kept alongside their
    # one-hot versions; the column slicing in train.py relies on this layout.
    frames=[processed_test_data,one_hot_marital,one_hot_contact,one_hot_poutcome]
    test_result=pd.concat(frames,axis=1)

    return test_result
  • train.py
import numpy as np
import pandas as pd

from MLP import MLP
import data_preparation



if __name__ == '__main__':

    # prepare the training and test data
    train_data = data_preparation.train_data_preparation()
    test_data = data_preparation.test_data_preparation()
    train_data = np.array(train_data)


    feature_train = train_data[:, 1:13]
    label_train = np.array(train_data[:, [13]].T)


    test_data = np.array(test_data)
    feature_test = test_data[:, 1:13]
    label_test=pd.read_csv('./submission.csv')
    label_test=np.array(label_test)
    test_label_list=[]

    for i in label_test[:,1]:
        label_value=data_preparation.judge(i)
        test_label_list.append(label_value)





    # multi-layer perceptron
    MLP_test = MLP()
    parameter_dict = MLP_test.train(feature=feature_train, label=label_train, hidden={1: 5}, learning_rate=0.001, iteration_num=1)
    # print(parameter_dict)
    result1 = MLP_test.predict(feature_test, parameter_dict)
    # result2 = MLP_test.predict(feature_test2, parameter_dict)

    # print(result1)
    # print(result2)
    error_count = 0
    for i in range(len(result1)):
        error_count = error_count + abs(result1[i] - test_label_list[i])
    error_rate = error_count / len(result1)
    accuracy_rate = 1 - error_rate

    print("error_rate:", error_rate)        # with these parameters, test accuracy is about 85%
    print("accuracy_rate:", accuracy_rate)
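As an optional cross-check, the same accuracy can also be computed with scikit-learn, since result1 and test_label_list are 0/1 lists of equal length:

from sklearn.metrics import accuracy_score
print("accuracy (sklearn):", accuracy_score(test_label_list, result1))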