Python 爬虫
简单获取网页源代码
>>> import urllib.request      #导入相应类库
>>> response = urllib.request.urlopen("http://www.fishc.com")
>>> html = response.read()
>>> print(html)       #输出为 bytes 类型(字节串),非 ASCII 字符以 \x 转义形式显示

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n<meta charset="UTF-8">\n<meta name="apple-mobile-web-app-capable" content="yes">\n<meta name="apple-touch-fullscreen" content="yes">\n<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0">\n<meta name="keywords" content="\xe9\xb1\xbcC\xe5\xb7\xa5\xe4\xbd\x9c\xe5\xae\xa4|\xe5\x85\x8d\xe8\xb4\xb9\xe7\xbc\x96\xe7\xa8\x8b\xe8\xa7\x86\xe9\xa2\x91\xe6\x95\x99\xe5\xad\xa6|Python\xe6\x95\x99\xe5\xad\xa6|Web\xe5\xbc\x80\xe5\x8f\x91\xe6\x95\x99\xe5\xad\xa6|\xe5\x85\xa8\xe6\xa0\x88\xe5\xbc\x80\xe5\x8f\x91\xe6\x95\x99\xe5\xad\xa6|C\xe8\xaf\xad\xe8\xa8\x80\xe6\x95\x99\xe5\xad\xa6|\xe6\xb1\x87\xe7\xbc\x96\xe6\x95\x99\xe5\xad\xa6|Win32\xe5\xbc\x80\xe5\x8f\x91|\xe5\x8a\xa0\xe5\xaf\x86\xe4\xb8\x8e\xe8\xa7\xa3\xe5\xaf\x86|Linux\xe6\x95\x99\xe5\xad\xa6">\n<meta name="description" content="\xe9\xb1\xbcC\xe5\xb7\xa5\xe4\xbd\x9c\xe5\xae\xa4\xe4\xb8\xba\xe5\xa4\xa7\xe5\xae\xb6\xe6\x8f\x90\xe4\xbe\x9b\xe6\x9c\x80\xe6\x9c\x89\xe8\xb6\xa3\xe7\x9a\x84\xe7\xbc\x96\xe7\xa8\x8b\xe8\xa7\x86\xe9\xa2\x91\xe6\x95\x99\xe5\xad\xa6\xe3\x80\x82">\n<meta name="author" content="\xe9\xb1\xbcC\xe5\xb7\xa5\xe4\xbd\x9c\xe5\xae\xa4">\n<title>\xe9\xb1\xbcC\xe5\xb7\xa5\xe4\xbd\x9c\xe5\xae\xa4-\xe5\x85\x8d\xe8\xb4\xb9\xe7\xbc\x96\xe7\xa8\x8b\xe8\xa7\x86\xe9\xa2\x91\xe6\x95\x99\xe5\xad\xa6|Python\xe6\x95\x99\xe5\xad\xa6|Web\xe5\xbc\x80\xe5\x8f\x91\xe6\x95\x99\xe5\xad\xa6|\xe5\x85\xa8\xe6\xa0\x88\xe5\xbc\x80\xe5\x8f\x91\xe6\x95\x99\xe5\xad\xa6|C\xe8\xaf\xad\xe8\xa8\x80\xe6\x95\x99\xe5\xad\xa6|\xe6\xb1\x87\xe7\xbc\x96\xe6\x95\x99\xe5\xad\xa6|Win32\xe5\xbc\x80\xe5\x8f\x91|\xe5\x8a\xa0\xe5\xaf\x86\xe4\xb8\x8e\xe8\xa7\xa3\xe5\xaf\x86|Linux\xe6\x95\x99\xe5\xad\xa6</title>\n<link rel="shortcut icon" type="image/x-icon" href="img/favicon.ico">\n<link rel="stylesheet" href="css/styles.css">\n<script src="js/jq.js"></script>\n<script src="js/fishcEgg.js"></script>\n<style>\n        html,\n        body {\n            height: 100%;\n            
padding: 0px;\n            margin: 0px;\n        }\n    </style>\n</head>\n<body>\n<img src="img/tradeMark.png" alt="trademark" usemap="#myTrademark" style="position:absolute; z-index: 99999; position: fixed; top: 15px; border: 0; right:0; display: block;">\n<map name="myTrademark">\n<area shape="poly" coords="8,0,45,0,196,148,196,185" href="http://fishc.taobao.com" alt="TaoBao" target="_blank">\n<area shape="poly" coords="67,0,98,0,196,97,196,129" href="https://fishc.com.cn/thread-1053-1-1.html" alt="Plan" target="_blank">\n</map>\n<img src="img/tR.gif" alt="trademark" usemap="#myTrademark" style="position:absolute; z-index: 9999; position: fixed; top: 15px; border: 0; right:0; display: block;">\n<map name="myTrademark">\n<div id="timeline-embed"></div>\n<script type="text/javascript">\n\n            window.onload = function () {\n                $(".storyjs-embed.sized-embed").css("padding-top", "0");\n                $(".vco-storyjs .vco-feature .vco-slide").css("padding-top", "0");\n            };\n\n            var timeline_config = {\n                width: "100%",\n                height: "100%",\n                start_at_end: true,\n                source: \'fishc.json\'\n            }\n\n            // \xe6\xb5\x8f\xe8\xa7\x88\xe5\x99\xa8\xe5\x88\xa4\xe6\x96\xad\xef\xbc\x8c\xe5\xa6\x82\xe6\x9e\x9c\xe6\x98\xafIE\xe5\xbc\xb9\xe5\x87\xba\xe6\x8f\x90\xe7\xa4\xba\xe6\xa1\x86\n            function getExplore() {\n                var Sys = {};\n                var ua = navigator.userAgent.toLowerCase();\n                var s;\n                (s = ua.match(/rv:([\\d.]+)\\) like gecko/)) ? Sys.ie = s[1] :\n                    (s = ua.match(/msie ([\\d\\.]+)/)) ? Sys.ie = s[1] :\n                        (s = ua.match(/edge\\/([\\d\\.]+)/)) ? Sys.edge = s[1] :\n                            (s = ua.match(/firefox\\/([\\d\\.]+)/)) ? Sys.firefox = s[1] :\n                                (s = ua.match(/(?:opera|opr).([\\d\\.]+)/)) ? 
Sys.opera = s[1] :\n                                    (s = ua.match(/chrome\\/([\\d\\.]+)/)) ? Sys.chrome = s[1] :\n                                        (s = ua.match(/version\\/([\\d\\.]+).*safari/)) ? Sys.safari = s[1] : 0;\n                // \xe6\xa0\xb9\xe6\x8d\xae\xe5\x85\xb3\xe7\xb3\xbb\xe8\xbf\x9b\xe8\xa1\x8c\xe5\x88\xa4\xe6\x96\xad\n                if (Sys.ie) alert(\'\xe8\xaf\xb7\xe4\xbd\xbf\xe7\x94\xa8\xe9\x9d\x9eIE\xe6\xb5\x8f\xe8\xa7\x88\xe5\x99\xa8\xe6\x89\x93\xe5\xbc\x80\xe6\x9c\xac\xe4\xb8\xbb\xe9\xa1\xb5\');\n\n            }\n            getExplore();\n        </script>\n<script type="text/javascript" src="build/js/storyjs-embed.js"></script>\n<div class="myICP">\n<a href="http://beian.miit.gov.cn/state/outPortal/loginPortal.action" target="_blank">\xe7\xb2\xa4ICP\xe5\xa4\x8718085999\xe5\x8f\xb7-2</a>\n</div>\n</body>\n</html>'
将字节数据按 UTF-8 解码为字符串
>>> html = html.decode("utf-8")
>>> print(html)

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-touch-fullscreen" content="yes">
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0">
<meta name="keywords" content="鱼C工作室|免费编程视频教学|Python教学|Web开发教学|全栈开发教学|C语言教学|汇编教学|Win32开发|加密与解密|Linux教学">
<meta name="description" content="鱼C工作室为大家提供最有趣的编程视频教学。">
<meta name="author" content="鱼C工作室">
<title>鱼C工作室-免费编程视频教学|Python教学|Web开发教学|全栈开发教学|C语言教学|汇编教学|Win32开发|加密与解密|Linux教学</title>
<link rel="shortcut icon" type="image/x-icon" href="img/favicon.ico">
<link rel="stylesheet" href="css/styles.css">
<script src="js/jq.js"></script>
<script src="js/fishcEgg.js"></script>
Logo

为开发者提供学习成长、分享交流、生态实践、资源工具等服务,帮助开发者快速成长。

更多推荐