使用Python3编写抓取网页和只抓网页图片的脚本

最基本的抓取网页内容的代码实现：

#!/usr/bin/env python3
"""Download a web page and print its first and last non-blank lines."""

from urllib.request import urlretrieve


def firstNonBlank(lines):
    """Return the first entry of *lines* that is not blank.

    A line counts as blank when it is empty or contains only whitespace.
    Returns None when *lines* is empty or every entry is blank.
    """
    for eachLine in lines:
        if eachLine.strip():
            return eachLine
    return None


def firstLast(webpage):
    """Print the first and last non-blank lines of the file *webpage*.

    *webpage* is a path to a local file (as returned by urlretrieve).
    """
    # The page's real encoding is not known at this point; decode as UTF-8
    # and replace undecodable bytes so that printing never fails on them.
    with open(webpage, encoding="utf-8", errors="replace") as f:
        lines = f.readlines()

    # Lines keep their own trailing newline, so print() must not add one.
    for line in (firstNonBlank(lines), firstNonBlank(reversed(lines))):
        if line is not None:  # nothing to show for an all-blank page
            print(line, end="")


def download(url='http://www', process=firstLast):
    """Fetch *url* into a temporary file and hand its path to *process*.

    Network and I/O failures are treated as "nothing downloaded": in that
    case *process* is simply not called.
    """
    try:
        retval = urlretrieve(url)[0]
    except OSError:  # URLError/HTTPError are OSError subclasses in Python 3
        retval = None
    if retval:
        process(retval)


if __name__ == '__main__':
    download()

利用urllib模块，来实现一个网页中针对图片的抓取功能：

import urllib.request
import socket
import re
import sys
import os

# Directory that downloaded images are written to.
targetDir = r"C:\Users\elqstux\Desktop\pic"

# Absolute http(s) URL ending in a known image extension.  Quotes and angle
# brackets are excluded so a match cannot run from one HTML attribute into
# the next, and the extension must follow a literal dot.
IMAGE_URL_RE = re.compile(r'''https?://[^\s"'<>]*?\.(?:jpg|png|gif)''')


def destFile(path):
    """Return the local file path under targetDir for the image URL *path*.

    The file name is whatever follows the last '/' in *path*.  targetDir is
    created (including missing parents) if it does not exist yet.
    """
    os.makedirs(targetDir, exist_ok=True)
    return os.path.join(targetDir, path.rpartition('/')[2])


def findImageLinks(html):
    """Return the distinct image URLs found in the text *html*, sorted."""
    return sorted(set(IMAGE_URL_RE.findall(html)))


if __name__ == "__main__":
    hostname = "http://www.douban.com"
    req = urllib.request.Request(hostname)
    with urllib.request.urlopen(req) as webpage:
        # Decode the body to real text; str(bytes) would give the b'...'
        # repr, in which newlines are the two characters '\' and 'n'.
        charset = webpage.headers.get_content_charset() or "utf-8"
        content = webpage.read().decode(charset, errors="replace")

    for link in findImageLinks(content):
        print(link)
        try:
            urllib.request.urlretrieve(link, destFile(link))
        except OSError as err:
            # One broken image should not abort the remaining downloads.
            print("failed to download", link, err, file=sys.stderr)

import urllib.request
import socket
import re
import sys
import os

# Directory that downloaded images would be written to.
targetDir = r"H:\pic"

# The pattern contains two nested pairs of parentheses, hence two groups:
# group 1 is the whole image URL, group 2 is just the extension.  Quotes and
# angle brackets are excluded so a match cannot span two HTML attributes.
IMAGE_RE = re.compile(r'''(https?://[^\s"'<>]*?\.(jpg|png|gif))''')


def destFile(path):
    """Return the local file path under targetDir for the image URL *path*.

    The URL is split on '/', and the last component is used as file name.
    targetDir is created (including missing parents) if it does not exist.
    """
    os.makedirs(targetDir, exist_ok=True)
    return os.path.join(targetDir, path.rpartition('/')[2])


def findImages(html):
    """Return a list of (url, extension) tuples for image URLs in *html*.

    re.findall returns only what the parenthesised groups matched, so each
    list entry is a 2-tuple, one element per group.  Duplicates are kept,
    in the order they occur in the text.
    """
    return IMAGE_RE.findall(html)


if __name__ == "__main__":
    hostname = "http://www.douban.com/"
    req = urllib.request.Request(hostname)
    with urllib.request.urlopen(req) as webpage:
        # Decode to real text instead of taking str() of the bytes object,
        # which would yield the escaped b'...' repr.
        charset = webpage.headers.get_content_charset() or "utf-8"
        content = webpage.read().decode(charset, errors="replace")

    for picname, picType in findImages(content):
        print(picname)
        print(picType)

    # Example output:
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   http://img3.douban.com/icon/g111328-1.jpg
    #   jpg
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   http://img3.douban.com/icon/g197523-19.jpg
    #   jpg
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   ...

搞代码网（gaodaima.com）提供的所有资源部分来自互联网，如果有侵犯您的版权或其他权益，请说明详细缘由并提供版权或权益证明然后发送到邮箱[email protected]‍，我们会在看到邮件的第一时间内为您处理，或直接联系QQ：872152909。本网站采用BY-NC-SA协议进行授权
转载请注明原文链接：使用Python3编写抓取网页和只抓网页图片的脚本

Hi，您需要填写昵称和邮箱！