文章目录[隐藏]
关键词:python,python爬取图片,爬取图片
python爬取图片
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from random import choice, randint
from time import sleep

import requests
from lxml import etree

# Root directory that receives one sub-directory per photo set.
SAVE_DIR = '华晨宇的照片'
# Seconds to wait on any single HTTP request; without a timeout one stalled
# connection would block its worker thread forever.
REQUEST_TIMEOUT = 10

# exist_ok makes this safe to re-run and free of the exists()/mkdir() race.
os.makedirs(SAVE_DIR, exist_ok=True)


def _fetch(url):
    """Return the response for a GET of *url*.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx status (so an error page is never mistaken for content).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _parse_html(text):
    """Parse *text* as HTML and return the root element, or None if unparseable."""
    try:
        return etree.HTML(text)
    except etree.LxmlError:
        # NOTE(review): depending on the lxml version an empty document either
        # raises or yields None; both are normalised to None here.
        return None


def _safe_name(name):
    """Replace path separators in a scraped title so it stays one path component."""
    for sep in (os.sep, os.altsep):
        if sep:
            name = name.replace(sep, '_')
    return name


def get_taotu_url(pages=5):
    """Collect the detail-page URLs of the photo sets listed on the index pages.

    Args:
        pages: Number of index pages to scan, starting from page 1
            (defaults to 5, the original hard-coded value).

    Returns:
        A list of photo-set URLs in page order. Index pages that cannot be
        fetched or parsed are reported on stdout and skipped.
    """
    taotu_urls = []
    for page in range(1, pages + 1):
        # Adjust this pattern when targeting a different section of the site.
        url = f'http://www.gaodaima.com/mt/hcy_{page}.html'
        try:
            rep = _fetch(url)
        except requests.RequestException as exc:
            print(f'列表页请求失败,已跳过:{url}({exc})')
            continue
        tree = _parse_html(rep.text)
        if tree is None:
            print(f'列表页解析失败,已跳过:{url}')
            continue
        hrefs = tree.xpath('//div[@class="tab_tj"]/div/div/ul/li/a/@href')
        # The original author treats only hrefs of exactly 39 characters as
        # valid photo-set links; everything else is discarded.
        taotu_urls.extend(href for href in hrefs if len(href) == 39)
    return taotu_urls


def get_img(url):
    """Download every image of the photo set whose detail page is *url*.

    Images are written to ``SAVE_DIR/<set title>/<set title>第N张.jpg``. A
    page that fails is skipped so the rest of the set is still attempted.

    Args:
        url: Detail-page URL of one photo set (as returned by get_taotu_url).

    Raises:
        requests.RequestException: if the detail page itself cannot be fetched.
        ValueError: if the title or page count cannot be read from the page.
        RuntimeError: after the whole set was attempted, if any image failed.
    """
    tree = _parse_html(_fetch(url).text)
    titles = tree.xpath('//div[@class="ptitle"]/h1/text()') if tree is not None else []
    counts = tree.xpath('//div[@class="ptitle"]/em/text()') if tree is not None else []
    if not titles or not counts:
        raise ValueError(f'页面结构不符,无法获取套图名称或页数:{url}')
    max_page = int(counts[0])

    name = _safe_name(titles[0])
    album_dir = os.path.join(SAVE_DIR, name)
    # A re-run, or two sets sharing a title, must not abort on an existing dir.
    os.makedirs(album_dir, exist_ok=True)

    failed_pages = []
    for i in range(1, max_page + 1):
        # Page N of a set lives at <detail>_N.html.
        page_url = url.replace('.html', f'_{i}.html')
        # Random pause between requests to avoid hammering the server.
        sleep(randint(1, 3))
        try:
            dom = _parse_html(_fetch(page_url).text)
            srcs = (
                dom.xpath('//div[@class="main-wrap"]/div[1]/a/img/@data-original')
                if dom is not None
                else []
            )
            if not srcs:
                print(f'未找到图片链接,已跳过:{page_url}')
                failed_pages.append(i)
                continue
            img = _fetch(srcs[0]).content
        except requests.RequestException as exc:
            print(f'图片下载失败,已跳过:{page_url}({exc})')
            failed_pages.append(i)
            continue

        file_name = name + f'第{i}张.jpg'
        with open(os.path.join(album_dir, file_name), 'wb') as f:
            f.write(img)
        print(f'成功下载图片:{file_name}')

    if failed_pages:
        # Surface partial failure to the caller instead of pretending success.
        raise RuntimeError(f'套图《{name}》有 {len(failed_pages)} 张图片下载失败:{failed_pages}')


def main():
    """Scrape all photo sets using a pool of four worker threads."""
    taotu_urls = get_taotu_url()
    failed = 0
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(get_img, url): url for url in taotu_urls}
        # Executor.map() only re-raises worker exceptions when its results are
        # consumed; inspecting each future makes failures visible.
        for future in as_completed(futures):
            exc = future.exception()
            if exc is not None:
                failed += 1
                print(f'套图下载出错:{futures[future]}({exc!r})')
    if failed:
        print(f'=================== 下载结束,{failed} 组套图出错 =====================')
    else:
        print('=================== 图片全部下载成功啦! =====================')


if __name__ == '__main__':
    main()
来源:搞代码网《python爬取图片》 http://www.gaodaima.com/68588.html