# Basic crawler steps with urllib: build a Request, open it, decode the body.

# 1. Create the request object (Request()).
url = "http://..."

# 1.1 To rotate identities, keep several User-Agent strings in a list and
#     pick one per request with random.choice(), then use it as the
#     "User-Agent" value below.
headers = {
    "User-Agent": "",  # browser disguise, to get past anti-crawler checks
    "Cookie": "",  # cookie used to simulate a logged-in session
}

# 1.2 Create the customised request object.
req = urllib.request.Request(url, headers=headers)

# 2. Get the response object (urlopen()). The `with` block closes the
#    underlying connection even if reading or decoding fails.
with urllib.request.urlopen(req) as res:
    # 3. Get the content.
    #    decode(): bytes -> str
    #    encode(): str -> bytes
    html = res.read().decode("utf-8")

print(html)
来源:搞代码网(www.gaodaima.com)
一、python爬虫基础步骤
# Custom opener: build an opener from a handler and install it globally.

# 1. Build the handler object (the object that actually processes requests).
http_hander = request.HTTPHandler()

# 2. Create the custom opener from that handler.
opener = request.build_opener(http_hander)

# 3. Create the request object.
req = request.Request("http://www.baidu.com")

# 4.1 One option is to send the request through the opener directly:
#     opener.open(req).read()
# 4.2 Here the opener is installed globally instead, so that plain
#     urlopen() calls also go through it.
request.install_opener(opener)

# Close the response once its body has been read.
with request.urlopen(req) as raw_response:
    reponse = raw_response.read()

print(reponse)
二、自定义opener
# GET request with a urlencoded query string.

# 1. Read the search term from the terminal.
key = input("请输入要搜索的内容:")
wd = {"wd": key}  # query parameters as a dict
url = "http://www.baidu.com/s?"

# 2. Percent-encode the parameters (handles non-ASCII search terms).
wdd = urllib.parse.urlencode(wd)

# 3. Append the encoded query to the base URL.
url = url + wdd

# 4. Create the request object.
req = request.Request(url)

# 5. Fetch and decode the response; the `with` block closes the connection.
with request.urlopen(req) as raw_response:
    reponse = raw_response.read().decode()

print(reponse)
三、处理get请求,进行urlencode编码
# POST request example: submit a form to the Youdao translate endpoint.

# 1. Build the request headers.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/79.0.3928.4 Safari/537.36"
}
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"

# 2. Read the text to translate from the terminal.
key = input("请输入要搜索的内容:")

# NOTE(review): salt/sign/lts/bv are hard-coded here; presumably they were
# captured from one browser session and the service may reject them for
# other inputs -- confirm before relying on this.
formdata = {
    "i": key,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16003477829589",
    "sign": "3f351e5f7e0d84706ef063ccabe3e169",
    "lts": "1600347782958",
    "bv": "cb9a601990a9118249221b303a87fd75",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME",
}

# 3. urlencode the form and convert it to bytes; passing `data` makes
#    urllib send a POST instead of a GET.
data = urllib.parse.urlencode(formdata).encode(encoding="utf-8")

# 4. Send the request and read the decoded body; the `with` block closes
#    the connection.
req = request.Request(url, data=data, headers=header)
with request.urlopen(req) as raw_response:
    resp = raw_response.read().decode()

# 5. Extract whatever sits between "tgt":" and "}]]} in the response.
#    The pattern is single-quoted because it contains double quotes.
pat = r'"tgt":"(.*?)"}]]}'
result = re.findall(pat, resp)

# Without a match (e.g. an error payload) show the raw response instead of
# failing with IndexError on result[0].
print(result[0] if result else resp)
四、处理post请求,有道翻译
# Exception handling: one failing URL must not stop the remaining requests.
list1 = [
    "http://www.baidu.com",
    "http://www.baidu.com",
    "http://www.baidu25234234235454254243.com",  # deliberately unreachable
    "http://www.baidu.com",
    "http://www.baidu.com",
]

# enumerate(start=1) replaces the hand-maintained counter.
for i, url in enumerate(list1, start=1):
    try:
        # Only reachability matters here; the `with` block closes the
        # response right away instead of leaking the connection.
        with request.urlopen(url):
            pass
    except Exception as e:
        # Deliberately broad: this loop is a best-effort demo, so any
        # failure is reported and the next URL is tried.
        print(e)
    print("第", i, "此请求完成")
——异常处理
# AJAX example: page through the Douban chart endpoint 20 entries at a time.

# The {} placeholder is filled with the start offset by str.format(), e.g.
# "site: {name}, address: {url}".format(name="...", url="...").
base_url = (
    "https://movie.douban.com/j/chart/top_list?"
    "type=11&interval_id=100%3A90&action=&start={}&limit=20"
)
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/79.0.3928.4 Safari/537.36"
}

i = 0
while True:
    url = base_url.format(i * 20)
    req = request.Request(url, headers=header)
    # Close each page's response before requesting the next one.
    with request.urlopen(req) as raw_response:
        res = raw_response.read().decode()
    print(res)
    # decode() always returns a str, so a None check can never fire.
    # NOTE(review): the endpoint presumably answers with an empty JSON list
    # ("[]") once the offset runs past the last entry -- confirm; without
    # that check the loop would never terminate.
    if res.strip() in ("", "[]"):
        break
    i += 1
五、ajax请求的使用
# HTTPS example: fetch a page while skipping certificate verification.
import ssl

url = "https://www.12306.cn/mormhweb/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/79.0.3928.4 Safari/537.36"
}
req = request.Request(url, headers=header)

# SECURITY: certificate verification is switched off on purpose here, which
# leaves the connection open to man-in-the-middle attacks. Only do this for
# hosts whose certificate cannot be validated, never for sensitive traffic.
# The context is built with the public ssl API rather than the private
# ssl._create_unverified_context() helper. check_hostname must be disabled
# before verify_mode is set to CERT_NONE.
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE

# The `with` block closes the connection after the body is read.
with request.urlopen(req, context=context) as raw_response:
    res = raw_response.read().decode()

print(res)
六、https请求的使用
# Qiushibaike example: download the text page and append every joke found
# to duanzi.txt.
url = "https://www.qiushibaike.com/text/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64;"
                  " x64; rv:80.0) Gecko/20100101 "
                  "Firefox/80.0"
}

# Build and send the request; a timeout keeps a stalled server from hanging
# the script forever.
res = requests.get(url, headers=header, timeout=10)
info = res.text

# Capture the text inside <div class="content"><span>...</span>. The pattern
# is single-quoted because it contains double quotes, and \s* skips the
# whitespace between the tags.
infos = re.findall(r'<div class="content">\s*<span>\s*(.+)\s*</span>', info)

# Open the output file once rather than once per joke; the loop variable no
# longer shadows the page text held in `info`.
with open("duanzi.txt", "a", encoding="utf-8") as f:
    for joke in infos:
        # NOTE(review): the separator is kept as a single space; it may have
        # been a newline escape before the page was scraped -- confirm.
        f.write(joke + " ")

print(infos)
七、糗事百科案例