本文实例讲述了Python实现从订阅源下载图片的方法。分享给大家供大家参考。具体如下:
这段代码基于 Python 3.4 实现,与 Python 2.x 相比有不少差别。
这是一个练习,数据源来自网易订阅。代码如下:
"""Download the images referenced by articles of NetEase subscription feeds.

Each feed URL in ``Down.collect_links`` returns a JSON article list; every
article page is fetched, its ``<img src=...>`` links are extracted and the
images are saved below ``Down.img_path`` in one sub-directory per feed.
"""
__author__ = 'Saint'

import json
import os
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser


def _clear_screen():
    """Clear the terminal with the command that exists on the host OS."""
    os.system("cls" if os.name == "nt" else "clear")


class MyHtmlParser(HTMLParser):
    """HTML parser that collects the ``src`` of every ``<img>`` tag it sees.

    After ``feed()`` the collected values are available in ``self.links``
    in document order.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Per-instance list: a class-level list would be shared by all
        # parsers, so each new page would also "contain" every image of the
        # pages parsed before it.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``src`` attribute of ``<img>`` start tags."""
        if tag != "img":
            return
        for name, value in attrs:
            # ``value`` is None for a bare attribute such as ``<img src>``.
            if name == "src" and value:
                self.links.append(value)


class Down(object):
    """Interactive downloader for the images of subscription-feed articles."""

    # Root directory under which one sub-directory per feed is created.
    img_path = "E:/saint"
    # Directory the current feed's images are written to (set by isDirExists).
    dir = ''
    # Feed URLs to collect from; each returns a JSON article list.
    collect_links = [
        "http://dy.163.com/v2/media/articlelist/T1374483113516-1",
        "http://dy.163.com/v2/media/articlelist/T1420776257254-1",
        "http://dy.163.com/v2/media/articlelist/T1376641060407-1",
    ]
    # Base URL of an article page; "/<tid>/<docid>" is appended to it.
    img_links = "http://dy.163.com/v2/article"

    def handleCollect(self):
        """Walk all feeds, downloading article images and prompting the user.

        When a feed cannot be read the user is asked whether to go on with
        the next feed ("y"), stop ("n") or - on any other input - the run is
        aborted as invalid input.  After each article the user may answer
        "n" to stop processing the current feed.
        """
        for collect_link in self.collect_links:
            print("开始从[" + collect_link + "]采集图片")
            # One download directory per feed, named after the last URL segment.
            self.isDirExists(collect_link.split("/")[-1])

            articles = self.getListFromSubscribe(collect_link)
            if articles is False:
                print("数据采集失败,是否继续(y/n)")
                answer = input()
                if answer == "y":
                    _clear_screen()
                    continue
                if answer == "n":
                    print("停止采集")
                else:
                    _clear_screen()
                    print("非法输入")
                break

            # NOTE(review): the original indentation was lost; the prompt is
            # kept inside the page loop (asked after every article), which is
            # the order the statements appear in - confirm this is intended.
            for page in articles:
                page_uri = self.img_links + "/" + page["tid"] + "/" + page["docid"]
                self.getImgFromUri(page_uri)
                print("是否继续(y/n)")
                if input() == "n":
                    _clear_screen()
                    print("采集完毕")
                    break
        print("OK")

    def getListFromSubscribe(self, uri):
        """Fetch the article list of the feed at *uri*.

        Returns the value of the response's ``data`` field when the request
        succeeds and the payload's ``code`` equals 1; returns ``False`` when
        the request fails, the body is not valid GBK-encoded JSON, or the
        payload reports an error (its ``msg`` is printed in that case).
        """
        try:
            with urllib.request.urlopen(uri) as res:
                # ``code`` is None for non-HTTP schemes such as file://.
                if res.code is not None and res.code >= 300:
                    _clear_screen()
                    return False
                # read() returns bytes in Python 3; decode with the page encoding.
                payload = json.loads(res.read().decode("gbk"))
        except (urllib.error.URLError, ValueError) as err:
            # URLError covers HTTP/network failures; ValueError covers both
            # decoding errors and malformed JSON.
            print(err)
            return False

        if payload['code'] != 1:
            print(payload['msg'])
            return False
        return payload['data']

    def getImgFromUri(self, uri):
        """Download every image referenced by the article page at *uri*."""
        with urllib.request.urlopen(uri) as res:
            # Only the (ASCII) image URLs matter, so a stray undecodable byte
            # in the page text should not abort the download.
            html_code = res.read().decode("gbk", errors="replace")

        hp = MyHtmlParser()
        hp.feed(html_code)
        hp.close()

        for link in hp.links:
            # Resolve relative and protocol-relative ``src`` values against
            # the page address; absolute URLs are returned unchanged.
            self.writeToDisk(urllib.parse.urljoin(uri, link))

    def isDirExists(self, dir_name):
        """Select ``<img_path>/<dir_name>`` as download directory, creating it.

        The chosen path is stored in ``self.dir``.  Always returns ``True``.
        """
        self.dir = os.path.join(self.img_path, dir_name)
        os.makedirs(self.dir, exist_ok=True)
        return True

    def writeToDisk(self, url):
        """Download *url* into ``self.dir``, named after the URL's last path segment.

        Returns ``True`` when the file was written, ``False`` when the URL
        has no usable file name or could not be fetched (the error is
        printed and the image is skipped).
        """
        # Use the path component only so a query string never ends up in the
        # file name.
        file_name = urllib.parse.urlsplit(url).path.rsplit("/", 1)[-1]
        if not file_name:
            return False
        try:
            with urllib.request.urlopen(url) as res:
                data = res.read()
        except (urllib.error.URLError, ValueError) as err:
            print(err)
            return False
        # Write to an explicit path instead of chdir()-ing, which would
        # change the working directory of the whole process.
        with open(os.path.join(self.dir, file_name), "wb") as out:
            out.write(data)
        return True


if __name__ == "__main__":
    down = Down()
    down.handleCollect()
希望本文所述对大家的Python程序设计有所帮助。