python广深地区房价数据的爬取与分析

本项目收集了广东省二手房数据，着重分析广深地区的房价。首先采用统计分析的方法对数据进行初步分析，大致了解房价分布及其影响因素；随后调用百度地图API，实现数据地图可视化。最后采用机器学习方法建模预测，并比较了几种常用回归模型的预测效果。基本符合一个完整数据分析案例的要求，采用直观的数据可视化方式展示数据，并通过数据分析为二手房购买者提供建设性意见。但仍有很多不足的地方，如并没有对数据进行特征工程，预测能力还有很大的提升空间。

LGDDDDDD

23591人浏览 · 2019-08-05 23:01:12

LGDDDDDD · 2019-08-05 23:01:12 发布

参考请标出处

1、数据爬取

房天下的网站，用最便捷的requests+xpath定位爬取。
由于房地产市场有一定的饱和，新房的数据量太小，因此选择二手房的数据

import requests
import re
from lxml import etree
import csv
import time

# Output CSV for the scraped listings. A raw string keeps the original path
# value unchanged while avoiding the invalid "\ " escape sequence.
fp = open(r'E:\ fangtianxia.csv', 'wt', newline='', encoding='utf-8')
writer = csv.writer(fp)
# Header row; get_info() writes its values in this same column order.
writer.writerow(('city','name','loc','size','area','price','price_sum','dire','floor','buildtime','advantage'))

# Browser-like request headers sent with every page request.
# Header names must not contain spaces, and each name appears only once.
headers = {
        'Connection': 'close',
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        # NOTE(review): "br" is only decodable when brotli support is installed - confirm.
        "accept-encoding": "gzip, deflate, br",
        "cache-control": "no-cache",
        "accept-language": "zh-CN,zh;q=0.9",
        "cookie" : "global_cookie=cvgwqloe7oksvtftupwmtsn1o20jztnjsd5; city=sz; Integrateactivity=notincludemc; integratecover=1; SKHRecordssz=%252c%25e5%25b1%2585%25e5%25ae%25b6%25e4%25b8%2589%25e6%2588%25bf%252c%25e7%2589%25a9%25e4%25b8%259a%252c%25e4%25b8%259a%25e4%25b8%25bb%25e8%25af%259a%25e5%25bf%2583%25e5%2587%25ba%25e5%2594%25ae%257c%255e2019%252f8%252f27%2b19%253a56%253a33%257c%255e0%257c%2523%25e5%25a4%25a7%25e8%25bf%2590%25e6%2596%25b0%25e5%259f%258e%2b%25e5%258e%2585%25e5%2587%25ba%25e9%2598%25b3%25e5%258f%25b0%2b%25e7%25b2%25be%25e8%25a3%2585%25e4%25b8%2589%25e6%2588%25bf%2b%25e6%25bb%25a1%25e4%25b8%25a4%25e5%25b9%25b4%257c%255e2019%252f8%252f27%2b19%253a56%253a41%257c%255e0; __utma=147393320.1831537449.1566899575.1566905739.1566993019.4; __utmz=147393320.1566993019.4.4.utmcsr=search.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/captcha-c342d934c8/; g_sourcepage=ehlist; __utmc=147393320; logGuid=a4782b6a-96fe-4bbf-90e4-395577d22851; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmb=147393320.18.10.1566993019; unique_cookie=U_klome40gpefgacg4y0p3st5ko1sjzv86iuc*6",
        "pragma": "no-cache",
        "referer": "https://sz.esf.fang.com/",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests" : "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
    }
# Collected listing-page URLs of every city; filled by the main block.
city_list=[]

def get_info(city_url):
    """Scrape one listing page and append each parsable listing to the CSV.

    The page number and the city abbreviation are taken from ``city_url``.
    Every ``<dl dataflag="bg">`` element on the page is treated as one
    listing. A listing that lacks a mandatory field is skipped on its own,
    so the remaining listings of the page are still written.

    Args:
        city_url: Listing-page URL containing ``//<city>.esf`` and
            ``house/i3<page>/``.
    """
    page_no = re.search(r'house/i3(.*?)/', city_url).group(1)
    city_name = re.search(r'//(.*?)\.esf', city_url).group(1)
    print('正爬取{}第{}页'.format(city_name, page_no))
    # Certificate verification is disabled below, so silence the warning.
    requests.packages.urllib3.disable_warnings()
    response = requests.get(city_url, headers=headers, timeout=None, verify=False)
    selector = etree.HTML(response.text)
    for info in selector.xpath('//dl[@dataflag="bg"]'):
        try:
            name = info.xpath('dd/p[2]/a/@title')
            name = name[0] if name else ' '
            loc = info.xpath('dd/p[2]/span/text()')[0]
            size = info.xpath('dd/p/text()[1]')[0].strip()
            area = info.xpath('dd/p/text()[2]')[0].strip()[:-2]
            dire = info.xpath('dd/p/text()[4]')[0].strip()
            floor = info.xpath('dd/p/text()[3]')[0].strip()
            buildtime = info.xpath('dd/p/text()[5]')
            buildtime = buildtime[0].strip() if buildtime else '未知'
            price = info.xpath('dd[2]/span[2]/text()')[0].strip()[:-4]
            pricesum = info.xpath('dd[2]/span/b/text()')[0].strip()
            # string(.) joins the text of all nested tags of the third <p>.
            advantage = info.xpath('dd/p[3]')[0].xpath('string(.)').strip()
            advantage = advantage if advantage else '无'
        except IndexError:
            # A mandatory field is missing: skip this listing only.
            continue
        print(city_name,name,loc,size,area,dire,floor,buildtime,price,pricesum,advantage)
        writer.writerow((city_name,name, loc, size, area, price, pricesum, dire, floor, buildtime, advantage))

if __name__=='__main__':
    # Sub-domain abbreviations of the Guangdong cities to scrape.
    city_name = ['sz','gz','zh','shaoguan','st','fs','zj', 'zhaoqing', 'jm', 'maoming','huizhou', 'meizhou',
                     'shanwei', 'heyuan', 'yangjiang', 'qingyuan', 'dg','zs', 'chaozhou', 'jieyang', 'yunfu']
    urls = ['https://{}.esf.fang.com'.format(city) for city in city_name]
    print(urls)
    for url in urls:
        response = requests.get(url,headers=headers,timeout=None)
        pages = re.findall('<p>共(.*?)页</p>', response.text)
        if not pages:
            # No page counter on this city's landing page: skip this city
            # only, instead of abandoning every remaining city.
            print("No page count found for {}, skipping".format(url))
            continue
        page = pages[0]
        print(page)
        city_urls = [url +'/house/i3' + str(i) + '/' for i in range(1, int(page) + 1)]
        print(city_urls)
        city_list.extend(city_urls)

    for city_ in city_list:
        try:
            get_info(city_)
        except Exception:
            # Best effort: back off briefly and move on to the next page.
            # Unlike a bare except, Ctrl-C still interrupts the run.
            print("Connection refused by the server..")
            print("Let me sleep for 5 seconds")
            time.sleep(5)
            print("now let me continue...")
            continue

fp.close()

2、明确需求与目的

当今时代，房价问题一直处于风口浪尖，房价的上涨抑或下跌都牵动着整个社会的利益，即便是政府出台各种政策方针也只能是暂时抑制楼市的涨势，对于需要买房的人来说，除了关注这些变化和政策外，还有一个非常头疼的问题，在哪里买房，房价怎样。一般的人会不停花大量精力逛链家、安居客等房地产网站，借助他们展示的内容进行筛选，但因地区众多，各个地段、房价差异的对比以及入手时机的把握，都得自己去一个个查阅与分析，非常麻烦。倘若可以通过数据的爬取，再按照自己希望的维度统计、分析与展示，会让数据变得清晰明了。本项目旨在提取并展示数据，为刚需购房者提供有用信息。

数据预览

提出问题

1、广东省房价的总体情况如何？
2、高端小区都有哪些？
3、广东省小区的命名偏好
4、广深两地的房源分布如何
5、广深房价与房屋面积大小的关系如何？
6、广深地区房源分布的地铁线以及房价与距地铁线距离的关系
7、广深地区房屋朝向
8、广深地区建设年份集中情况
9、广深地区热门户型

3.数据预处理

第一步导入相关的库，并做相关设置

import os
os.chdir('H:\\ana\data')#切换到指定路径
import numpy as np
import pandas as pd
from pyecharts import Map,Bar,WordCloud,Pie
import matplotlib.pyplot as plt
import re 
import seaborn as sns
from scipy import stats
plt.style.use('ggplot')
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  #解决seaborn中文字体显示问题
plt.rc('figure', figsize=(10, 10))  #把plt默认的图片size调大一点
plt.rcParams["figure.dpi"] =mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题
%matplotlib inline

第二步加载数据集

# Load the scraped listings from the Excel workbook.
data=pd.read_excel('房产数据.xlsx')
print(data.shape)  # (number of rows, number of columns)
# data.head()  # first rows
# data.tail()  # last rows
data.sample(10)  # ten randomly chosen rows

在这里插入图片描述

数据清洗

缺失值

通过info查看数据信息。
也可以通过isnull与sum结合，查看缺失值情况。

data.info()

在这里插入图片描述

data.isnull().sum()

在这里插入图片描述
缺失值占总数据的10%左右时，可以直接删去；但若达到30%左右及以上，可以采用填充的方法，用均值、中位数或众数填充，视情况而定。

# 删除所有含有空值的行。就地修改。
# Drop every row that contains a missing value, modifying `data` in place,
# then re-count the missing values per column.
data.dropna(axis=0, inplace=True)
data.isnull().sum()

异常值(对连续性标签做处理）

通过describe查看数值信息。
可配合箱线图辅助。
异常值可以删除，视为缺失值，或者不处理。

data.describe()

在这里插入图片描述

# Draw each distribution on its own axes. Two bare sns.boxplot calls in one
# cell both target the current axes and end up drawn on top of each other.
fig, (ax_price, ax_area) = plt.subplots(1, 2)
sns.boxplot(data=data['price'], ax=ax_price)
ax_price.set_title('price')
sns.boxplot(data=data['area'], ax=ax_area)
ax_area.set_title('area')

在这里插入图片描述

箱线图包括最小值、第一四分位数q1、中位数、第三四分位数q3、最大值和离群点。
离群点定义为小于 q1 - 1.5×IQR 或大于 q3 + 1.5×IQR 的点（其中 IQR = q3 - q1）。
离群点可能为异常值，但就此看这些离群点都是算在一个合理的范围内的。

第一张图为价格的箱线图，离群点很多，说明广东省存在个别市的房价差异巨大，但大多数处于较低的水平。这一现象符合认知，珠三角城市房价大于其他地级市房价。
第二张图为房屋面积的箱线图，也存在不少的离群点，面积位于100-200平米的房屋占绝大多数，存在特别大面积的房屋有可能是集体宿舍，经浏览网页发现也有可能存在商家乱填刷单的现象，属于异常值，需要后续清洗

#清洗面积和价格的异常数据，主观选取一个较合理范围
# Remove rows whose value exceeds a subjectively chosen upper bound,
# first for the floor area and then for the unit price.
for column, upper_bound in (('area', 300), ('price', 200000)):
    data = data.drop(data[data[column] > upper_bound].index)

异常值(对离散标签做处理）

1、朝向

data['dire'].unique()

在这里插入图片描述
由于极其少数网站页面没有朝向的数据，而是把后面建成时间的数据提前，因此有些错乱。
但无妨，我们只需做个简单的过滤，保留有正确朝向的数据

# Keep only the listings whose orientation is one of the recognised values;
# every other row is dropped.
valid_directions = ['东北向', '北向', '南向', '东向', '西南向', '南北向', '东南向', '西北向', '西向', '东西向']
is_invalid = ~data['dire'].isin(valid_directions)
index = data.index[is_invalid.values]
data = data.drop(index)

2、建成时间

data['buildtime'].unique()

在这里插入图片描述
发现存在几项异常的时间，有可能是地产商预计的建成时间，我们不考虑未建成的房屋，因此做一个过滤

# Drop the listings whose build year is one of the listed future years.
future_years = ['2020年建','2021年建', '2022年建', '2025年建']
is_future = data['buildtime'].isin(future_years)
index = data.index[is_future.values]
data = data.drop(index)

3、楼层

data['floor'].unique()

在这里插入图片描述
这一项数据非常凌乱，而且存在有一些极其异常的数据，我们做一个过滤

#清洗个别严重异常数据
# Drop the few listings with the floor descriptions listed below.
abnormal_floors = ['低层（共302层）','低层（共215层）','低层（共130层）', '低层（共220层）','低层（共142层）']
is_abnormal = data['floor'].isin(abnormal_floors)
index = data.index[is_abnormal.values]
data = data.drop(index)

4、房屋布局

data['size'].unique()

在这里插入图片描述
室和厅数量比较多的查看后发现均为集体宿舍或大型别墅，在一个比较合理的范围

# Drop the listings whose layout is recorded as zero rooms and zero halls.
empty_layouts = [ '0室0厅']
is_empty_layout = data['size'].isin(empty_layouts)
index = data.index[is_empty_layout.values]
data = data.drop(index)

最后将清洗后的数据保存，把广州深圳的数据另外保存，下面会着重分析

#保存清洗后的数据
# Persist the cleaned data set, plus one extra file per focus city.
data.to_csv('data_clean.csv', index=False)
for city, prefix in (('深圳', 'shenzhen'), ('广州', 'guangzhou')):
    city_rows = data.loc[data.city == city, :]
    city_rows.to_csv('{}_data_clean.csv'.format(prefix), index=False)

4、数据分析

问题1、广东省房价的总体情况如何？

# Mean unit price and number of listings per city, most expensive city first.
g = data.groupby("city")
r = g["price"].agg(["mean", "count"]).sort_values("mean", ascending=False)
display(r)
r.plot(kind="bar")

在这里插入图片描述
很明显发现，珠三角城市房价位居前列，粤东粤西的边缘城市靠后，其中广州深圳的房价显著超出平均水平。数据量上珠三角城市也明显占优，说明大城市的房地产市场更加火爆。而小城市中阳江的数据量也比较大，个人认为应该是当地海陵岛的旅游业比较火爆，带动房产市场。

接下来绘制一个价格地图，更加直观的展示数据

# The map component expects full city names, so append the "市" suffix.
city_sum = [city + '市' for city in r.index.tolist()]
price_avg = r['mean'].tolist()
# Named price_map rather than map, so the builtin map() is not shadowed.
price_map = Map('广东省各地级市平均房价','单位：元/平方米',
                title_color="#fff",title_pos="center",
                width=1200,  height=600,
                background_color='#404a59')
price_map.add("", city_sum, price_avg,
              maptype='广东',visual_range=[7000,35000],
              is_visualmap=True, visual_text_color='#000',
              is_label_show=True)

在这里插入图片描述

问题2、高端小区都有哪些？

我们发现就算在珠三角地区中，城市内的各个楼盘也存在很大的价格差异。
定义房价大于10万的小区为高端小区，作一个展示

# Listings priced above 100,000 per square metre, grouped city by city in the
# order of `r`. The price filter runs once and the per-city pieces are joined
# in a single concat instead of growing a DataFrame inside the loop.
city_sum = r.index.tolist()
expensive = data.loc[data.price > 100000, :]
upscale_community = pd.concat(
    [expensive.loc[expensive.city == city, :] for city in city_sum],
    axis=0, ignore_index=True)

# One entry per community name, each with the same weight in the word cloud.
upscale_community = upscale_community.loc[:, 'name'].drop_duplicates()
name = upscale_community.tolist()
value = [1] * len(name)
wordcloud = WordCloud(width=1500, height=800)
wordcloud.add('', name, value, word_size_range=[10,20])

在这里插入图片描述

问题3、广东省小区的命名偏好

from collections import Counter

import jieba
import jieba.analyse

# Load the cleaned listings; every column is read as text.
rows = pd.read_csv('data_clean.csv', header=0, encoding='utf-8', dtype=str)

# Count the keywords that TextRank extracts from each community name.
# The column is selected by label: positional row[1] on a label-indexed
# Series is deprecated in pandas.
word_counts = Counter()
for content in rows['name'].dropna():
    # TextRank keyword extraction restricted to the given parts of speech.
    words = jieba.analyse.textrank(content, topK=50, withWeight=False,
                                   allowPOS=('ns', 'n', 'vn', 'v'))
    word_counts.update(words)

# Word frequencies, highest first; show the top 30.
dfWord = pd.Series(word_counts, name='count', dtype='int64')
dfWord.index.name = 'word'
dfWord.sort_values(ascending=False)[:30]

输出结果如下
在这里插入图片描述
从上看出广东省的小区偏好以花园、广场、时代、国际等词命名。同时也看出房地产商保利在广东省占据一定的市场份额

问题4、广深两地的房源分布如何

想要把房源的分布在地图上展示出来，需要经纬度的数据，这里调用百度地图的API，把已有的地址数据转换为经纬度数据

import json
from urllib.request import urlopen, quote
import requests,csv
import pandas as pd 
def getlnglat(address, ak='###############'):
    """Geocode an address with the Baidu Maps geocoder and return the reply.

    Args:
        address: Address text; it is percent-encoded before being sent.
        ak: Baidu Maps API key. The default is a placeholder and must be
            replaced with a personal key.

    Returns:
        The decoded JSON reply as a Python object. Callers in this file read
        ``['result']['location']['lng']`` and ``['lat']`` from it.
    """
    url = 'http://api.map.baidu.com/geocoder/v2/'
    output = 'json'
    add = quote(address)  # percent-encode the (Chinese) address text
    uri = url + '?' + 'address=' + add + '&output=' + output + '&ak=' + ak
    # The context manager closes the HTTP response even if reading fails.
    with urlopen(uri) as req:
        res = req.read().decode()
    return json.loads(res)

# file = open(r'H:\ana\point.json','w')  # would create a JSON data file
# Geocode every Guangzhou listing (Guangzhou is used as the example city).
# A separate variable is used so the global `data` that later cells filter
# by city is not overwritten with the Guangzhou-only table.
geo_data = pd.read_csv(r'H:\ana\data\guangzhou_data_clean.csv')
for i in range(len(geo_data)):
    loc = '广州市' + geo_data.loc[i, 'loc']
    price = geo_data.loc[i, 'price']
    try:
        # One API call per listing; longitude and latitude come from the same reply.
        location = getlnglat(loc)['result']['location']
        lng = location['lng']
        lat = location['lat']
    except KeyError:
        # The reply has no coordinates for this address: skip only this row.
        continue
    geo_data.loc[i, 'lng'] = lng
    geo_data.loc[i, 'lat'] = lat
    geo_data.loc[i, 'point'] = str(lng) + ',' + str(lat)
    # Heat-map point weighted by the listing price.
    str_temp = '{"lat":' + str(lat) + ',"lng":' + str(lng) + ',"count":' + str(price) + '},'
    print(str_temp, i)  # can be pasted into the Baidu heat-map API page

geo_data.to_csv('gz_latlon.csv', index=False)

获取经纬度后，在百度地图平台上上传相关带有经纬度的数据即可制作相关的热力地图
结果如下：
深圳
在这里插入图片描述

广州

可以发现，深圳的房源分布较为均匀，大多集中在南山区和福田区。图示中点越大代表价格越高，发现深圳湾周边和福田区中心的位置房价相对高很多。
广州房源的分布集中在白云区和天河区，也有一些小的集群点分布在广州北站、广州东站等交通枢纽附近，而广州南站较为偏僻，比较少房子分布

问题5、广深房价与房屋面积大小的关系如何？

def area_price_relation(city):
    """Plot unit price against floor area for one city with a regression fit.

    Reads ``<city>_data_clean.csv`` and draws a seaborn joint plot of the
    ``area`` and ``price`` columns. The Pearson correlation is computed
    explicitly and written onto the plot, because the ``stat_func`` argument
    is no longer accepted by ``seaborn.jointplot``.

    Args:
        city: File-name prefix, e.g. ``'shenzhen'`` or ``'guangzhou'``.

    Returns:
        The seaborn grid object holding the plot.
    """
    data = pd.read_csv('{}_data_clean.csv'.format(city))
    data = data.dropna(subset=['area', 'price'])
    g = sns.jointplot(x='area', y='price', data=data, kind='reg')
    r_value, p_value = stats.pearsonr(data['area'], data['price'])
    g.ax_joint.annotate('pearsonr = {:.2f}; p = {:.2g}'.format(r_value, p_value),
                        xy=(0.05, 0.95), xycoords='axes fraction', va='top')
    g.fig.set_dpi(100)
    g.ax_joint.set_xlabel('面积', fontweight='bold')
    g.ax_joint.set_ylabel('价格', fontweight='bold')
    return g

area_price_relation('shenzhen')

在这里插入图片描述

area_price_relation('guangzhou')

在这里插入图片描述
可见价格与面积之间有一定的正相关关系。深圳中面积的影响更大，说明深圳的房价受波动更大，房价的不稳定性更大。

问题6、广深地区房源分布的地铁线以及房价与距地铁线距离的关系

def get_distance(city,data=data):
    """Extract subway line and distance per listing and chart listings per line.

    For each listing of ``city`` the digits in the ``advantage`` text are
    collected. When exactly two numbers are found, the first is taken as the
    subway line and the second as the distance, which is stored in a new
    ``distance`` column. The enriched rows are saved to
    ``<short>_distance.csv``, where ``<short>`` is ``sz`` / ``gz`` for
    深圳 / 广州 (the names the later cells read) and the city name itself
    otherwise.

    Args:
        city: City name as stored in the ``city`` column.
        data: Cleaned listings table (bound when the function is defined).

    Returns:
        A pyecharts ``Bar`` with the number of listings per subway line.
    """
    from collections import Counter

    short_names = {'深圳': 'sz', '广州': 'gz'}
    data1 = data.loc[data.city == city, :].reset_index(drop=True)
    # Create the column up front so it exists even when no listing matches.
    data1['distance'] = np.nan
    station = []  # subway line of every listing that has one
    for i in range(len(data1)):
        s = re.findall(r'\d+', str(data1.loc[i, 'advantage']))
        if len(s) == 2:
            station.append(s[0])
            data1.loc[i, 'distance'] = int(s[1])
    # Saved for the later distance / orientation analysis.
    data1.to_csv('{}_distance.csv'.format(short_names.get(city, city)), index=False)

    line_counts = Counter(station)
    # Numeric order, so that line 10 comes after line 9 and not after line 1.
    lines = sorted(line_counts, key=int)
    station_name = ['{}号线'.format(line) for line in lines]
    station_count = [line_counts[line] for line in lines]
    bar=Bar('')
    bar.add('' , station_name , station_count ,
            is_label_show=True , is_more_utils = True)
    return bar

get_distance('深圳')

在这里插入图片描述

get_distance('广州')

在这里插入图片描述

def distance_price_relation(city_short):
    """Plot unit price against the subway distance with a regression fit.

    Reads ``<city_short>_distance.csv`` and uses only the rows that have a
    ``distance`` value. The Pearson correlation is computed explicitly and
    written onto the plot, because the ``stat_func`` argument is no longer
    accepted by ``seaborn.jointplot``.

    Args:
        city_short: File-name prefix, e.g. ``'sz'`` or ``'gz'``.

    Returns:
        The seaborn grid object holding the plot.
    """
    data = pd.read_csv('{}_distance.csv'.format(city_short))
    data = data.dropna(subset=['distance', 'price'])
    g = sns.jointplot(x='distance', y='price', data=data, kind='reg')
    r_value, p_value = stats.pearsonr(data['distance'], data['price'])
    g.ax_joint.annotate('pearsonr = {:.2f}; p = {:.2g}'.format(r_value, p_value),
                        xy=(0.05, 0.95), xycoords='axes fraction', va='top')
    g.fig.set_dpi(100)
    g.ax_joint.set_xlabel('最近地铁距离',fontweight='bold')
    g.ax_joint.set_ylabel('价格',fontweight='bold')
    return g

distance_price_relation('sz')

在这里插入图片描述

distance_price_relation('gz')

在这里插入图片描述
从上可见，深圳房子多分布在1、2、3号线，广州多分布在2、3、5、6号线。
房价与距离地铁站的距离有一定的负相关关系，距离越近，房价越高的趋势大。

问题7、广深地区房屋朝向

def dire_pie(city_short):
    """Return a pie chart of listing orientations read from ``<city_short>_distance.csv``."""
    listings = pd.read_csv('{}_distance.csv'.format(city_short))
    # Orientation frequencies, most common first.
    orientation_counts = listings.dire.value_counts()
    labels = orientation_counts.index.tolist()
    amounts = orientation_counts.values.tolist()
    pie = Pie('朝向统计饼状图', title_pos='center')
    pie.add('饼图', labels, amounts, is_label_show=True,
            legend_orient='vertical', legend_pos='left',
            is_more_utils=True)
    return pie

dire_pie('sz')

在这里插入图片描述

dire_pie('gz')

在这里插入图片描述
很明显发现朝南的房子占50%以上。说明很多房地产商会偏向于建造朝南的房子，以吸引顾客
朝南的房子有其优点：
1、由于我国位于北半球，大部分时间阳光从南方照射过来，而居住南面则房屋采光良好；
2、夏天时，强烈的下午阳光会偏向北方，南面的房屋可以避免下午阳光造成的高温；
3、冬天时，阳光会偏向于南面房屋，使得房屋在寒冷季节可以保持比较温暖。

问题8、广深地区建设年份集中情况

def time_pie(city):
    """Return a pie chart of the five most frequent build years of one city."""
    # Build-year frequencies for the selected city, most common first.
    year_counts = data[data.city == city].buildtime.value_counts()
    top_years = year_counts.index.tolist()[:5]
    top_counts = year_counts.values.tolist()[:5]
    pie = Pie('建年统计饼状图', title_pos='center')
    pie.add('饼图', top_years, top_counts, is_label_show=True,
            legend_orient='vertical', legend_pos='left',
            is_more_utils=True)

    return pie

time_pie('深圳')

在这里插入图片描述

time_pie('广州')

在这里插入图片描述
从上可发现，广深地区的房子集中在2014和2015两年，一定程度上说明这两年是房地产业迅猛发展的两年。同时2018年在前列，一定程度上说明广深地区流动人口占有较大的比重，房屋商品化，二手房的交易市场较热。也有大量年代较远的房子在售，说明这些老房子有一定的市场。

问题9、广深地区热门户型

def size(n,data=data):
    """Return a bar chart of the five most frequent layouts in city ``n``."""
    # Layout frequencies for the selected city, most common first.
    layout_counts = data[data.city == n]['size'].value_counts()
    chart = Bar('户型排行')
    chart.add('', layout_counts.index[:5], layout_counts.values[:5],
              is_label_show=True, is_more_utils=True)
    return chart

size('深圳')

在这里插入图片描述

size('广州')

在这里插入图片描述
从上得出，广深地区热门户型非常一致，其中最热门为3室2厅

5、机器学习预测房价

采用机器学习算法综合考虑多个因素对房价的影响，建立预测模型。
首先要将数据转换为可以作为模型输入的矩阵形式

sz_data = pd.read_csv('sz_distance.csv')
gz_data = pd.read_csv('gz_distance.csv')


def transform(data):
    """Add three 0/1 indicator columns derived from the ``advantage`` text.

    Each listing's ``advantage`` text is tokenised with jieba; an indicator
    is 1 when its keyword is one of the tokens and 0 otherwise. The table is
    modified in place.
    """
    # (keyword looked up among the tokens, name of the indicator column)
    keyword_columns = (
        ('满二', 'exemption of business tax'),
        ('满五', 'exemption of double tax'),
        ('教育', 'quality education'),
    )
    for i in range(len(data)):
        words = list(jieba.cut(data.loc[i, 'advantage']))
        for keyword, column in keyword_columns:
            data.loc[i, column] = 1 if keyword in words else 0


transform(sz_data)
transform(gz_data)

观察数据，发现房屋优势特征中满二、满五、优质教育的字段很多，因此单独转换为0和1，作为输入。

new_data=pd.DataFrame()
def datatrans(new_data,data,dire_sum=None):
    """Fill ``new_data`` with the numeric model features derived from ``data``.

    Columns written, in order: ``city``, ``area``, ``buildtime`` (the year
    digits as a float, NaN when there are none), ``distance``, ``room_num``
    and ``hall_num`` (first and second number of ``size``), ``floor``
    (1 / 2 / 3 for 低层 / 中层 / 高层, NaN otherwise), ``dire`` (1-based
    position of the orientation in ``dire_sum``, NaN when it is not listed)
    and the three 0/1 indicator columns.

    Args:
        new_data: DataFrame to fill; modified in place.
        data: Listings table that already contains the indicator columns.
        dire_sum: Ordered list of orientation labels. When omitted, the
            distinct orientations of the module-level ``gz_data`` followed by
            those only present in ``sz_data`` are used, so both cities share
            one encoding.

    Returns:
        ``new_data``, for convenience.
    """
    if dire_sum is None:
        # Resolved at call time (not at definition time) and covering both
        # cities, so a Shenzhen-only orientation does not raise.
        dire_sum = list(pd.concat([gz_data['dire'], sz_data['dire']]).unique())
    dire_codes = {label: float(pos) for pos, label in enumerate(dire_sum, start=1)}

    new_data['city']=data['city']
    new_data['area']=data['area']
    # Values such as '2015年建' cannot be cast directly; keep the digits only.
    build_year = data['buildtime'].astype(str).str.extract(r'(\d+)', expand=False)
    new_data['buildtime'] = pd.to_numeric(build_year, errors='coerce').astype(float)
    new_data['distance']=data['distance']

    # '3室2厅' -> 3 rooms, 2 halls.
    layout = data['size'].astype(str).str.extract(r'(\d+)\D+(\d+)')
    new_data['room_num'] = pd.to_numeric(layout[0], errors='coerce').astype(float)
    new_data['hall_num'] = pd.to_numeric(layout[1], errors='coerce').astype(float)

    floor_text = data['floor'].astype(str)
    floor_level = np.select(
        [floor_text.str.contains('低层', regex=False).to_numpy(dtype=bool),
         floor_text.str.contains('中层', regex=False).to_numpy(dtype=bool),
         floor_text.str.contains('高层', regex=False).to_numpy(dtype=bool)],
        [1.0, 2.0, 3.0],
        default=np.nan,
    )
    new_data['floor'] = pd.Series(floor_level, index=data.index)

    new_data['dire'] = data['dire'].map(dire_codes)

    new_data['exemption of business tax']=data['exemption of business tax']
    new_data['exemption of double tax']=data['exemption of double tax']
    new_data['quality education']=data['quality education']
    return new_data

datatrans(new_data,sz_data)
new_data1=pd.DataFrame()
datatrans(new_data1,gz_data)
new_data1=pd.concat([new_data1,new_data],axis=0,ignore_index=True)

进一步处理数据，将楼层按照低中高分别赋值1、2、3作为输入。
再用正则表达式将房屋布局的数据拆分为房间数量和客厅数量两个特征输入。
将各个不同朝向的数据转化为1-8作为输入

# Target values, Guangzhou first and Shenzhen second.
gz_price = gz_data['price']
sz_price = sz_data['price']
price = pd.concat([gz_price, sz_price], axis=0, ignore_index=True)

# Replace the city label by one indicator column per city, then save.
city_indicators = pd.get_dummies(new_data1.city)
new_data1 = new_data1.join(city_indicators).drop('city', axis=1)
new_data1.to_csv('new_data7.20.csv', index=False)

当前数据有11个特征（房屋面积、建成时间、距地铁站距离、房间数、客厅数、楼层、方向、是否满二、是否满五、是否优质教育、城市）和1个标记（房价）。因为预测目标——房价是一个连续变量，因此本项目中的价格预测是一个回归问题。

数据预处理

data=pd.read_csv('new_data7.20.csv')
# Listings without subway information get a fixed distance of 5000; a missing
# build year is replaced by the most frequent one. Assigning the result back
# avoids fillna(inplace=True) on a column selection, which may leave `data`
# unchanged.
data['distance'] = data['distance'].fillna(5000)
data['buildtime'] = data['buildtime'].fillna(data['buildtime'].mode()[0])

# The saved feature file has no price column; the target is the `price`
# series built together with the feature table (same row order).
y = price
X = data.drop(columns=['price'], errors='ignore')

# Split the data: a random 25% is the test set, the rest the training set.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)


# Standardise the features; the scaler is fitted on the training set only.
from sklearn.preprocessing import StandardScaler
ss_x = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)

线性回归

# Fit an ordinary linear regression, score it and plot the first 100
# test predictions against the true values.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()     # create the model
lr.fit(x_train, y_train)    # fit on the training set
lr_y_predict = lr.predict(x_test)   # predict the test set
# Evaluate the predictions with the R-squared score.
from sklearn.metrics import  r2_score
print("LinearRegression模型的R方得分为：", r2_score(y_test, lr_y_predict))

plt.figure(figsize=(15, 5))
plt.plot(y_test.values[:100], "-r", label="真实值")
plt.plot(lr_y_predict[:100], "-g", label="预测值")
plt.legend()
plt.title("线性回归预测结果")

在这里插入图片描述

KNN

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Two search spaces: uniform weighting over k = 1..11, and distance weighting
# over the same k combined with the Minkowski power p = 1..5.
param_grid = [
    {'weights': ['uniform'], 'n_neighbors': list(range(1, 12))},
    {'weights': ['distance'], 'n_neighbors': list(range(1, 12)), 'p': list(range(1, 6))},
]
knnrgr = KNeighborsRegressor()
grid_search = GridSearchCV(knnrgr, param_grid)
grid_search.fit(x_train, y_train)

用网格搜索法寻找最优参数，训练结果为
在这里插入图片描述

其他的回归模型

from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, Ridge, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Each label sits next to its model so the two cannot drift apart.
# Ridge is labelled "岭回归" and KernelRidge "核岭回归".
named_models = [
    ("岭回归", Ridge()),
    ("Lasso回归", Lasso(alpha=0.01,max_iter=10000)),
    ("随机森林", RandomForestRegressor()),
    ("梯度提升树", GradientBoostingRegressor()),
    ("支持向量机", SVR()),
    ("弹性网络", ElasticNet(alpha=0.001,max_iter=10000)),
    ("梯度下降回归", SGDRegressor(max_iter=1000,tol=1e-3)),
    ("贝叶斯线性回归", BayesianRidge()),
    ("核岭回归", KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)),
    ("极端随机森林回归", ExtraTreesRegressor()),
    ("Xgboost回归", XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=False, objective='reg:gamma')),
]
names = [label for label, _ in named_models]
models = [model for _, model in named_models]
# Fit every model on the training set; print its test score and test MSE.
for name, model in named_models:
    model.fit(x_train,y_train)
    predicted= model.predict(x_test)
    print("{}: {:.6f}, {:.4f}".format(name,model.score(x_test,y_test),mean_squared_error(y_test, predicted)))

结果如下
在这里插入图片描述

模型调参

class grid():
    """Small helper that runs a 5-fold grid search for one estimator."""

    def __init__(self,model):
        self.model = model

    def grid_get(self,X,y,param_grid):
        """Search ``param_grid`` on (X, y) and print the best and per-candidate results."""
        search = GridSearchCV(self.model, param_grid, cv=5, n_jobs=-1)
        search.fit(X, y)
        print(search.best_params_, search.best_score_)
        report_columns = ['params', 'mean_test_score', 'std_test_score']
        print(pd.DataFrame(search.cv_results_)[report_columns])

Lasso回归调参

grid(Lasso()).grid_get(x_train,y_train,{'alpha': [0.0004,0.0005,0.0007,0.0006,0.0009,0.0008],'max_iter':[10000]})

在这里插入图片描述
岭回归调参

grid(Ridge()).grid_get(x_train,y_train,{'alpha':[35,40,45,50,55,60,65,70,80,90]})

在这里插入图片描述
核岭回归调参

param_grid={'alpha':[0.2,0.3,0.4,0.5], 'kernel':["polynomial"], 'degree':[3],'coef0':[0.8,1,1.2]}
grid(KernelRidge()).grid_get(x_train,y_train,param_grid)

在这里插入图片描述
弹性网络调参

grid(ElasticNet()).grid_get(x_train,y_train,{'alpha':[0.0005,0.0008,0.004,0.005],'l1_ratio':[0.08,0.1,0.3,0.5,0.7],'max_iter':[10000]})

在这里插入图片描述

模型加权平均集成

def r2(model,X,y):
    """Return the R-squared scores of ``model`` from 5-fold cross-validation."""
    fold_scores = cross_val_score(model, X, y, scoring="r2", cv=5)
    return fold_scores


from sklearn.base import BaseEstimator, RegressorMixin, clone


class AverageWeight(BaseEstimator, RegressorMixin):
    """Regressor whose prediction is a weighted sum of several base models.

    Args:
        mod: Sequence of unfitted base estimators.
        weight: One weight per base estimator, in the same order.
    """

    def __init__(self,mod,weight):
        self.mod = mod
        self.weight = weight

    def fit(self,X,y):
        """Fit a fresh clone of every base estimator on (X, y).

        Raises:
            ValueError: If the number of weights differs from the number of
                models (previously the extra entries were silently ignored).
        """
        if len(self.mod) != len(self.weight):
            raise ValueError(
                "got {} models but {} weights".format(len(self.mod), len(self.weight)))
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X,y)
        return self

    def predict(self,X):
        """Return, for every sample, the weighted sum of the base predictions."""
        # Rows are models, columns are samples.
        pred = np.array([model.predict(X) for model in self.models_])
        weights = np.asarray(self.weight, dtype=float)
        return np.dot(weights, pred)
        
# Base models with the hyper-parameters chosen in the tuning section.
lasso = Lasso(alpha=0.0009,max_iter=10000)
ridge = Ridge(alpha=35)
ker = KernelRidge(alpha=0.5 ,kernel='polynomial',degree=3 , coef0=0.8)
ela = ElasticNet(alpha=0.005,l1_ratio=0.3,max_iter=10000)
bay = BayesianRidge()

# Weights assigned according to the R-squared of each model.
w1 = 0.15
w2 = 0.15
w3 = 0.4
w4 = 0.15
w5 = 0.15

weight_avg = AverageWeight(mod = [lasso,ridge,ker,ela,bay],weight=[w1,w2,w3,w4,w5])
# Cross-validate once and reuse the scores; the original ran the whole
# 5-fold cross-validation twice and discarded the first result.
cv_scores = r2(weight_avg,x_train,y_train)
print(cv_scores)
cv_scores.mean()

在这里插入图片描述

模型融合

from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
from sklearn.model_selection import KFold


class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: base models feed out-of-fold predictions to a meta model.

    Args:
        mod: Sequence of unfitted base estimators.
        meta_model: Estimator trained on the base models' out-of-fold
            predictions. A clone is fitted, so the passed object is untouched.
    """

    def __init__(self,mod,meta_model):
        self.mod = mod
        self.meta_model = meta_model
        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)

    def fit(self,X,y):
        """Fit every base model on each fold, then the meta model on the out-of-fold predictions."""
        # Positional indexing below needs arrays; a pandas Series would be
        # indexed by label instead.
        X = np.asarray(X)
        y = np.asarray(y)
        self.saved_model = [list() for _ in self.mod]
        oof_train = np.zeros((X.shape[0], len(self.mod)))

        for i,model in enumerate(self.mod):
            for train_index, val_index in self.kf.split(X,y):
                renew_model = clone(model)
                renew_model.fit(X[train_index], y[train_index])
                self.saved_model[i].append(renew_model)
                oof_train[val_index,i] = renew_model.predict(X[val_index])

        self.meta_model_ = clone(self.meta_model)
        self.meta_model_.fit(oof_train,y)
        return self

    def predict(self,X):
        """Average each base model's fold predictions and pass them to the meta model."""
        X = np.asarray(X)
        # np.column_stack needs a real sequence, not a generator.
        whole_test = np.column_stack([
            np.column_stack([model.predict(X) for model in fold_models]).mean(axis=1)
            for fold_models in self.saved_model
        ])
        return self.meta_model_.predict(whole_test)

    def get_oof(self,X,y,test_X):
        """Return (out-of-fold predictions on X, fold-averaged predictions on test_X), one column per base model."""
        X = np.asarray(X)
        y = np.asarray(y)
        test_X = np.asarray(test_X)
        oof = np.zeros((X.shape[0],len(self.mod)))
        # One column per fold, taken from the splitter instead of a literal 5.
        test_single = np.zeros((test_X.shape[0],self.kf.get_n_splits()))
        test_mean = np.zeros((test_X.shape[0],len(self.mod)))
        for i,model in enumerate(self.mod):
            for j, (train_index,val_index) in enumerate(self.kf.split(X,y)):
                clone_model = clone(model)
                clone_model.fit(X[train_index],y[train_index])
                oof[val_index,i] = clone_model.predict(X[val_index])
                test_single[:,j] = clone_model.predict(test_X)
            test_mean[:,i] = test_single.mean(axis=1)
        return oof, test_mean
# Imputer was removed from scikit-learn; SimpleImputer is its replacement and
# also fills missing values with the column mean by default.
from sklearn.impute import SimpleImputer
a = SimpleImputer().fit_transform(x_train)
b = SimpleImputer().fit_transform(y_train.values.reshape(-1,1)).ravel()
stack_model = stacking(mod=[lasso,ridge,ker,ela,bay],meta_model=ker)

6、总结

本项目收集了广东省二手房数据，着重分析广深地区的房价。首先采用统计分析的方法对数据进行初步分析，大致了解房价分布及其影响因素；随后调用百度地图API，实现数据地图可视化。最后采用机器学习方法建模预测，并比较了几种常用回归模型的预测效果。
基本符合一个完整数据分析案例的要求，采用直观的数据可视化方式展示数据，并通过数据分析为二手房购买者提供建设性意见。但仍有很多不足的地方，如并没有对数据进行特征工程，没有进行特征的转换和筛选，机器学习模型的调参也比较简略，因此预测能力还有很大的提升空间。