python 爬虫及数据可视化展示

学了有关python爬虫及数据可视化的知识,想着做一些总结,加强自己的学习成果,也能给各位小伙伴一些小小的启发。

1、做任何事情都要明确自己的目的,想要做什么,打算怎么做,做到什么样的程度,自己有一个清晰的定位,虽然计划永远赶不上变化,但是按计划走,见招拆招或许也是不错的选择。

2、本项目是爬取豆瓣的250部电影,将电影名,电影链接,评分等信息爬取保存到本地。将相关信息以列表的形式展示在网页上,访问者可通我的网站直接挑转到豆瓣查看电影,将评分制作评分走势图,将电影制作成词云图在网页上展示,共有五个网页,可相互跳转。

项目流程图:

在这里插入图片描述

数据爬取:

# -*- codeing = utf-8 -*-
# @Time : 2022/1/11 22:39
# @Author : lj
# @File : spider.PY
# @Software: 4{PRODUCT_NAME}
import bs4  # 网页解析,获取数据 对网页的数据进行拆分
import re   #正则表达式,进行文字匹配   对数据进行提炼
import urllib.request,urllib.error  #指定url 获取网页数据 怕网页
import xlwt  #进行excel 操作  存再excel中
import sqlite3 #进行sqllite 数据库操作 存在数据库中
import time
# 主函数
def main():
    # 调用函数
    url = "https://movie.douban.com/top250?start="
    datalist1 = allData(url)
    # savepath = "豆瓣电影top250.xls"
    # savedata(datalist1,savepath)
    dbpath = "move.db"
    savedatasql(datalist1,dbpath)
#匹配所需内容的正则表达式
linkpattern = re.compile(r'<a href="(.*?)/">')
#匹配图片的正则表达式
imagepattern = re.compile(r'<img .*src=".*"/>',re.S)#re.S 忽略换行符,.表示除了换行符以外的所有字符
#匹配影名的正则表达式
namepattern = re.compile(r'<span class="title">(.*)</span>')
# 影片评分
gradepattern = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# 评价人数
peoplepattern = re.compile(r'<span>(\d*)人评价</span>')#(\d*) 零个或多个
#概况
thinkpattern = re.compile(r'<span class="inq">(.*)</span>')
#影片的相关内容
contentpattern = re.compile(r'<p class="">(.*?)</p>',re.S)#忽略换行符

#1、爬取一个网页
def getData(url1):
    head = {
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"}
    request = urllib.request.Request(url1,headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e,"code"):
            print(e.code)
        if hasattr(e,"reason"):
            print(e.reason)
    return html  #返回给调用的地方
# 2、爬取所有网页,匹配分析
def allData(url):
    datalist = []
    for i in range(0, 10):  # 左闭右开 调用十次,每次二十五条信息
        url1 = url + str(i * 25)
        html = getData(url1) #保存获取到的网页源码
        time.sleep(1)
        # 逐页解析
        soup = bs4.BeautifulSoup(html,"html.parser") #返回树型结构
        for item in soup.find_all('div',class_="item"): #查找符合要求的字符串,返回列表,class加下划线
            data = []
            item = str(item)
            #link 获取到影片的超链接
            link = re.findall(linkpattern,item)[0]
            data.append(link)
            # 影片图片
            image = re.findall(imagepattern,item)[0]
            data.append(image)
            # 影片名
            name = re.findall(namepattern,item)
            if(len(name)==2):
                chinaname = name[0]
                data.append(chinaname)
                outername = name[1].replace("/","")#.replace("/","") 列表内置的方法,将/替换为空""
                data.append(outername)
            else:
                data.append(name[0])
                data.append('  ')#外文名空出来
            # 影片评分
            grade = re.findall(gradepattern,item)[0]
            data.append(grade)
            # 影片评价人数
            people = re.findall(peoplepattern, item)[0]
            data.append(people)
            # 影片概况
            think = re.findall(thinkpattern, item)
            if len(think) != 0:
                think = think[0].replace("。","")
                data.append(think)
            else:
                data.append("  ")
            # 影片内容
            content = re.findall(contentpattern, item)[0]
            content = re.sub('<br(\s+)?/>(\s+)?'," ",content)#替换内容中多余的符号和内容
            content = re.sub('/'," ",content)
            data.append(content.strip())#去除列表中的空格
            datalist.append(data)
    return datalist
#3、保存数据到excel
# def savedata(datalist1,savepath):
#     workplace = xlwt.Workbook(encoding="utf-8",style_compression=0)#style_compression=0·压缩样式
#     worksheet = workplace.add_sheet("豆瓣电影top250",cell_overwrite_ok="true")#cell_overwrite_ok=true 是否可以覆盖
#     col = ('电影详情链接','电影图片链接','影片中文名','影片外文名','评分','评价人数','概况','影片内容')
#     for i in range(0,8):
#         worksheet.write(0,i,col[i])
#     for i in range(0,250):
#         print("打印了%d条" %(i+1))
#         databuffer = datalist1[i]
#         for j in range(0,8):
#             worksheet.write(i+1,j,databuffer[j])
#     workplace.save(savepath) #保存
#3、保存数据到数据库
def savedatasql(datalist1,dbpath):#dbpath 数据库的路径位置
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()#获取一个游标,存放sql语句的执行结果
    #cur 获得了游标可存放执行结果的对象
    #将datalist1中的数据依次遍历写入数据库
    for data in datalist1:
        for index in range(len(data)):

            if index == 4 or index == 5:
                continue
            # data[index] = "'" + data[index].replace(u'\xa0','') + "'"
            data[index] = data[index].replace("'", "")
            data[index] = "'"+data[index]+"'"

        sql = '''
                insert into move250(
                move_link,img_link,move_chinaname,move_foriername,grade,numbers,introduction,content)
                values (%s)'''%",".join(data)
        #print(sql)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()
    print("write successful")

#创建数据库
def init_db(dbpath):

    sql = '''
        create table move250
        (
        id integer primary key autoincrement,
        move_link text,
        img_link text,
        move_chinaname varchar,
        move_foriername varchar,
        grade numeric,
        numbers numeric,
        introduction text,
        content text
        )
    '''
    c = sqlite3.connect(dbpath)
    buffer = c.cursor()  # 获取游标
    buffer.execute(sql)  # 内置的方法执行sql语句
    c.commit()   #         数据提交,写入数据库
    c.close()   #          数据库关闭
    print("database create successful")
if __name__ == "__main__":
    #调用函数
    main()
    print("爬取完毕!")


数据库数据

在这里插入图片描述

excel数据

在这里插入图片描述

可视化制作

在这里插入图片描述

路由分配,网页渲染

import sqlite3

from flask import Flask,render_template

app = Flask(__name__)

@app.route('/')
def first():  # put application's code here
    return render_template("index-1.html")
# 每一个函数对应一个路由解析
@app.route('/index')
def first1():
    return render_template('index-1.html')

@app.route('/movie')
def mv():
    datalist =  []
    con = sqlite3.Connection('move.db')
    cursor = con.cursor()
    sql = "select * from move250"
    data = cursor.execute(sql)
    for items in data:
        datalist.append(items)
    cursor.close()
    con.close()

    return render_template('about.html',mmm = datalist)

@app.route('/score')
def sc():
    x = []
    y = []
    con = sqlite3.Connection('move.db')
    cursor = con.cursor()
    sql = "select grade,count(grade) from move250 group by grade"
    data1 = cursor.execute(sql)
    for ite in data1:
       x.append(ite[0])
       y.append(ite[1])
    cursor.close()
    con.close()
    return render_template('services.html',score = x,number = y)

@app.route('/word')
def wd():
    return render_template('projects-details.html')

@app.route('/team')
def te():
    return render_template('team.html')

if __name__ == '__main__':
    app.run()

网页源码,由于网页源码过多,只放一页

网页效果

在这里插入图片描述

<!DOCTYPE html>
<html lang="en">

<head>
    <style>
        #container{
            border: 2px solid red;
            height: 1000px;
            width: 1200px;
            top: 50px;
            left: 100px;
        }
    </style>
    
    <!-- ========== Meta Tags ========== -->
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="description" content="Buskey - Corporate Business Template">

    <!-- ========== Page Title ========== -->
    <title>Buskey - Corporate Business Template</title>

    <!-- ========== Start Stylesheet ========== -->
    <link href="../static/css/plugins.min.css" rel="stylesheet">
    <link href="../static/css/flaticon-business-set.css" rel="stylesheet">
    <link href="../static/css/style.css" rel="stylesheet">
    <link href="../static/css/responsive.css" rel="stylesheet">
    <!-- ========== End Stylesheet ========== -->

    <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
    <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
    <!--[if lt IE 9]>


 <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"></script>
    <![endif]-->

    <!-- ========== Google Fonts ========== -->
    <link href="../static/css/css.css" rel="stylesheet">
    <link href="../static/css/css1.css" rel="stylesheet">

</head>

<body>

    <!-- Preloader Start -->
    <div class="se-pre-con"></div>
    <!-- Preloader Ends -->

    <!-- Start Header Top 
    ============================================= -->
    <div class="top-bar-area bg-theme text-light">
        <div class="container">
            <div class="row">
                <div class="col-md-9">
                    <div class="info box">
                        <ul>
                            <li>
                                <div class="icon">
                                    <i class="fas fa-map-marker-alt"></i>
                                </div>
                                <div class="info">
                                    <p>
                                        china
                                    </p>
                                </div>
                            </li>
                            <li>
                                <div class="icon">
                                    <i class="fas fa-envelope-open"></i>
                                </div>
                                <div class="info">
                                    <p>
                                        1751108164@qq.com
                                    </p>
                                </div>
                            </li>
                            <li>
                                <div class="icon">
                                    <i class="fas fa-mobile-alt"></i>
                                </div>
                                <div class="info">
                                    <p>
                                        +123 456 7890
                                    </p>
                                </div>
                            </li>
                        </ul>
                    </div>
                </div>
                <div class="topbar-social col-md-3">

                </div>
            </div>
        </div>
    </div>
    <!-- End Header Top -->

    <!-- Header 
    ============================================= -->
    <header>

        <!-- Start Navigation -->
        <nav class="navbar navbar-default navbar-sticky bootsnav">

            <div class="container">

                <!-- Start Header Navigation -->
                <div class="navbar-header">
                    <button type="button" class="navbar-toggle" data-toggle="collapse" data-target="#navbar-menu">
                        <i class="fa fa-bars"></i>
                    </button>
                    <a class="navbar-brand" href="index.html">
                        <img src="../static/picture/logo-light.png" class="logo logo-display" alt="Logo">
                    </a>
                </div>
                <!-- End Header Navigation -->

                <!-- Collect the nav links, forms, and other content for toggling -->
                <div class="collapse navbar-collapse" id="navbar-menu">
                    <ul class="nav navbar-nav navbar-right" data-in="#" data-out="#">
                        <li class="dropdown">
                            <a href="index-1.html" class="dropdown-toggle active" data-toggle="dropdown">首页</a>

                        </li>
                        <li>
                            <a href="about.html">电影列表</a>
                        </li>
                        <li>
                            <a href="">评分</a>
                        </li>
                        <li class="dropdown">
                            <a href="projects-details.html" class="dropdown-toggle" data-toggle="dropdown">词云</a>

                        </li>
                        <li class="dropdown">
                            <a href="team.html" class="dropdown-toggle" data-toggle="dropdown">团队</a>

                        </li>
                    </ul>
                </div><!-- /.navbar-collapse -->
            </div>

        </nav>
        <!-- End Navigation -->

    </header>
    <!-- End Header -->

    <!-- Start Breadcrumb
    ============================================= -->
    <div class="breadcrumb-area shadow dark bg-fixed text-center padding-xl text-light" style="background-image: url(../static/image/21.jpg);">
        <div class="container">
            <div class="row">
                <div class="col-md-6 col-sm-6 text-left">
                    <h1>影线评分</h1>
                </div>
                <div class="col-md-6 col-sm-6 text-right">

                </div>
            </div>
        </div>
    </div>
    <!-- End Breadcrumb -->

    <!-- Start Fun Factor -->

    <!-- Start Fun Factor -->
<div class="fun-factor-area default-padding text-center bg-fixed shadow theme-hard parallax parralax-shadow" data-parallax="scroll" style="background-image: url(../static/image/12.jpg);">
        <div class="container">
            <div class="row">
                <div class="col-md-12">
                    <a href="about.html">
                    <div class="col-md-3 col-sm-6 item">
                        <div class="fun-fact">
                            <i class="flaticon-world-map"></i>
                            <div class="timer" data-to="250" data-speed="5000"></div>
                            <span class="medium">经典电影</span>
                        </div>
                    </div>
                        </a>
                    <a href="services.html">
                    <div class="col-md-3 col-sm-6 item">
                        <div class="fun-fact">
                            <i class="flaticon-gears"></i>
                            <div class="timer" data-to="1" data-speed="5000"></div>
                            <span class="medium">评分统计</span>
                        </div>
                    </div>
                        </a>
                    <a href="projects-details.html">
                    <div class="col-md-3 col-sm-6 item">
                        <div class="fun-fact">
                            <i class="flaticon-id-card"></i>
                            <div class="timer" data-to="5693" data-speed="5000"></div>
                            <span class="medium">高频词汇</span>
                        </div>
                    </div>
                        </a>
                    <a href="team.html ">
                    <div class="col-md-3 col-sm-6 item">
                        <div class="fun-fact">
                            <i class="flaticon-id"></i>
                            <div class="timer" data-to="5" data-speed="5000"></div>
                            <span class="medium">专业团队</span>
                        </div>
                    </div>
                        </a>
                </div>
            </div>
        </div>
    </div>
    <!-- End Fun Factor -->
    <!-- Start Services
    ============================================= -->
    <div class="carousel-services-area bg-gray">
        <div class="container-box oh">
            <div class="carousel-service-items owl-carousel owl-theme">
                <div id="container" ></div>
            </div>
        </div>
    </div>
    <!-- End Services -->

    <!-- jQuery Frameworks
    ============================================= -->

     <!-- echarts 图表
    ============================================= -->
    <script src="../static/js/plugins.min.js"></script>
    <script src="../static/js/main.js"></script>
        <script type="text/javascript" src="echarts.min.js"></script>
        <!-- Uncomment this line if you want to dataTool extension
        <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts@5/dist/extension/dataTool.min.js"></script>
        -->
        <!-- Uncomment this line if you want to use gl extension
        <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts-gl@2/dist/echarts-gl.min.js"></script>
        -->
        <!-- Uncomment this line if you want to echarts-stat extension
        <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts-stat@latest/dist/ecStat.min.js"></script>
        -->
        <!-- Uncomment this line if you want to use map
        <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts@5/map/js/china.js"></script>
        <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts@5/map/js/world.js"></script>
        -->
        <!-- Uncomment these two lines if you want to use bmap extension
        <script type="text/javascript" src="https://api.map.baidu.com/api?v=2.0&ak=<Your Key Here>"></script>
        <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts@{{version}}/dist/extension/bmap.min.js"></script>
        -->

        <script type="text/javascript">
var dom = document.getElementById("container");
var myChart = echarts.init(dom);
var app = {};

var option;

option = {
  xAxis: {
    type: 'category',
    data: [8.3,8.4,8.5,8.6,8.7,8.8,8.9,9,9.1,9.2,9.3,9.4,9.5,9.6,9.7]
  },
  yAxis: {
    type: 'value'
  },
  series: [
    {
      data: [1,3,9,23,40,40,39,15,28,17,20,7,4,3,1,],
      type: 'bar',
      showBackground: true,
      backgroundStyle: {
        color: 'rgba(180, 180, 180, 0.2)'
      }
    }
  ]
};

option && myChart.setOption(option);
        </script>

</body>
</html>

网页的免费模板很多,可以挑选自己喜欢的进行修改,并插入图表等。

词云图片制作,本项目采用电影的简介制作

# -*- codeing = utf-8 -*-
# @Time : 2022/2/2 12:53
# @Author : lj
# @File : testcloud.PY
# @Software: 4{PRODUCT_NAME}
import jieba
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import sqlite3
# 主要的流程,1.jieba提取文字,2.生成词云
#准备好数据
con = sqlite3.connect('move.db')
cur = con.cursor()
sql = 'select introduction from move250'
data = cur.execute(sql)
text = ""
for item in data:
    text = text + item[0]
# print(text)
cur.close()
con.close()
#文本分词分句
cut = jieba.cut(text)
r = ' '.join(cut)

print(len(r))

# 处理图片的
img = Image.open(r'.\templates\TEST\img\img/ll.jpg')
img_arrays = np.array(img)  #将图片转换为数组


#词云库封装一个对象
wc =WordCloud(
    background_color='white',  #输出图片的颜色
     mask=img_arrays,    #导入处理好的图片
    font_path = 'STXINWEI.TTF' #字体的文件
)
#词云对象处理已经分好的词
wc.generate(r)


#绘制图片,plt matplotlib的库别名
fig = plt.figure(1)  #matplotlib库figure方法可绘图
plt.imshow(wc)   #按照wc的规则显示图片
plt.axis('off')  #不显示x轴

# plt.show() #生成的词云图片

#输出词云图片到文件中,设置清晰度
plt.savefig(r'.\templates\TEST\img\img/wordcloud2.jpg',dpi=800)

词云图展示:

在这里插入图片描述

将制作好的图片放在网页的合适网页展示即可


在这里插入图片描述

至此此项完成

问题:1、爬取之后在数据库中的数据无法在网页上显示。

2、先查看网页后在网页中跳转没有问题,但是通过端口路径访问页面之后跳转到其他网页就无法跳转。欢迎大佬指正。

Logo

为开发者提供学习成长、分享交流、生态实践、资源工具等服务,帮助开发者快速成长。

更多推荐