今天下午用python写了一个爬桌面壁纸的爬虫。
十分简单,毕竟大部分网站都没有反爬策略。
import requests
from lxml import etree
import re
import time
# Base URL of the wallhaven toplist and a browser-like User-Agent.
# The original used curly quotes ('‘…’'), which is a SyntaxError in Python.
url = 'https://wallhaven.cc/toplist'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
}
def getmaxlistnum():
    """Return the total number of toplist pages as a string.

    Fetches page 2 of the toplist and scrapes the "x / y" pagination
    header, extracting the last 1-4 digit number found in it.

    Returns:
        str: the maximum page number, e.g. '830'.

    Raises:
        IndexError: if the page layout changes and the xpath/regex
            find nothing (both ``.pop()`` calls assume a non-empty result).
    """
    # (Restored from a single collapsed line, which was invalid Python.)
    makelistnumlink = 'https://wallhaven.cc/toplist?page=2'
    r = requests.get(makelistnumlink, headers=headers)
    r = etree.HTML(r.content.decode('utf8'))
    # Second text node of the <h2> header holds the "/ N" page total.
    list_num_xpath = '/html/body/main/div[1]/section/header/h2/text()[2]'
    list_num = r.xpath(list_num_xpath)
    list_num = list_num.pop()
    # NOTE(review): assumes the total page count is at most 4 digits.
    list_num = re.findall('([0-9]{1,4})', list_num).pop()
    return list_num
# Report the page total (the original print used curly quotes — SyntaxError).
list_num = getmaxlistnum()
print('目前一共有' + list_num + '页壁纸')
def writefilespng(endlink, imgname, num):
    """Download ``endlink`` and save it as ``<imgname>.png`` in the cwd.

    Retries up to 3 times, sleeping 30 s between attempts on any
    requests error.  ``num`` is unused; kept for interface compatibility
    with the caller.
    """
    # (Restored from a single collapsed line, which was invalid Python.)
    bgimg = None
    count = 1
    while count < 4:
        try:
            bgimg = requests.get(endlink, headers=headers, timeout=5).content
            break
        except requests.exceptions.RequestException:
            count += 1
            time.sleep(30)
    if bgimg is None:
        # All retries failed.  The original fell through and crashed with
        # a NameError on the unbound ``bgimg``; skip this image instead.
        print('下载失败:' + endlink)
        return
    with open(imgname + '.png', 'wb') as mh:
        mh.write(bgimg)
    print('已实现' + imgname + '.png')
def writefilesjpg(endlink, imgname, num):
    """Download ``endlink`` and save it as ``<imgname>.jpg`` in the cwd.

    Retries up to 3 times, sleeping 30 s between attempts on any
    requests error.  ``num`` is unused; kept for interface compatibility
    with the caller.
    """
    # (Restored from a single collapsed line, which was invalid Python.)
    bgimg = None
    count = 1
    while count < 4:
        try:
            bgimg = requests.get(endlink, headers=headers, timeout=5).content
            break
        except requests.exceptions.RequestException:
            count += 1
            time.sleep(30)
    if bgimg is None:
        # All retries failed.  The original fell through and crashed with
        # a NameError on the unbound ``bgimg``; skip this image instead.
        print('下载失败:' + endlink)
        return
    with open(imgname + '.jpg', 'wb') as mh:
        mh.write(bgimg)
    print('已实现' + imgname + '.jpg')
def makebgimg(url, num):
    """Crawl one toplist page: visit every wallpaper detail page and save
    the full-resolution image via writefilespng/writefilesjpg.

    Args:
        url: a toplist page URL, e.g. 'https://wallhaven.cc/toplist?page=1'.
        num: page index, passed through to the writer functions.
    """
    # (Restored from a single collapsed line, which was invalid Python.)
    backgroundimgurl_xpath = '//*[@id="thumbs"]/section/ul/li/figure/a/@href'
    r = requests.get(url, headers=headers)
    r = etree.HTML(r.content.decode('utf8'))
    backgroundimgurl = r.xpath(backgroundimgurl_xpath)
    endlink_xpath = '//*[@id="wallpaper"]/@src'
    # Plain iteration: the original index-loop popped from the end, which
    # visited the thumbnails in reverse order.
    for everylink in backgroundimgurl:
        r = requests.get(everylink, headers=headers)
        r = etree.HTML(r.content.decode('utf8'))
        endlink = r.xpath(endlink_xpath).pop()
        # Escaped dot + explicit alternation: the original '[pngjp]{3}'
        # also matched nonsense extensions like 'pnj' or 'jpn'.
        ext_match = re.findall(r'.*[a-z0-9]{6}\.(png|jpg)', endlink)
        if not ext_match:
            continue  # unrecognized extension — skip instead of crashing
        ext = ext_match.pop()
        imgname = re.findall(r'.*([a-z0-9]{6})\.' + ext, endlink).pop()
        if ext == 'png':
            writefilespng(endlink, imgname, num)
        else:
            writefilesjpg(endlink, imgname, num)
def makelink(wantget, base_url='https://wallhaven.cc/toplist'):
    """Build the list of toplist page URLs from a user-supplied range string.

    Every number in ``wantget`` ('1-10', '1.2.3', '1-1', ...) is extracted
    and URLs are generated for min..max inclusive.

    Fixes vs the original:
      * '([0-9])' only matched single digits, so '1-10' turned into
        pages 0 through 1 — now multi-digit numbers work;
      * numbers were sorted as strings ('10' < '2'), which could make the
        range empty — they are compared as ints now;
      * a single page such as '1-1' collapsed to one set element and the
        second ``pop`` raised IndexError — min/max avoid popping entirely.

    Args:
        wantget: the raw range string entered by the user.
        base_url: toplist base URL (default matches the module-level ``url``).

    Returns:
        list[str]: page URLs, empty if no digits were found.
    """
    pages = {int(n) for n in re.findall(r'[0-9]+', wantget)}
    if not pages:
        return []
    return [base_url + '?page=' + str(i) for i in range(min(pages), max(pages) + 1)]
def mainbk():
    """Interactive entry point: ask the user for a page range and crawl it."""
    # (Restored from a single collapsed line, which was invalid Python.)
    print('*' * 30)
    print('壁纸网站:https://wallhaven.cc/toplist')
    print('只是爬取toplist')
    print('注:输入的是一个范畴,如果想要独自的页码请只输入一个数字')
    # '\n' restored — the published source had lost the backslashes and
    # printed a literal 'n'; 输出 (output) corrected to 输入 (input).
    wantget = input('请输入你想爬取的页数,如1-10代表爬取1-10页,1-1代表爬取第一页\n'
                    '不要用123456这类页码示意,举荐1-2,1.2.3之类的\n请输入:')
    urllist = makelink(wantget)
    # enumerate keeps page order; the original popped from the end and
    # therefore crawled the pages in reverse.
    for num, page_url in enumerate(urllist):
        makebgimg(page_url, num)


if __name__ == '__main__':
    # Guard keeps the crawler from auto-running when the module is imported.
    mainbk()
大体思路就是上面的代码了:首先获取页码,然后通过 re 提取出一个列表,然后创建链接,接下来传入到创建图片的函数里进行操作,目前我测试应该没什么大问题。
不过可以将 sleep 去掉,或者把等待时间改短一些,不然有的时候会很急人的。
关于 python 爬取桌面壁纸就简单说到这里,这类爬虫基本上会写一个,其他的也都会写了,大体思路都是相通的。
python 爬取桌面壁纸