# ================== Import the required libraries ==================
from bs4 import BeautifulSoup
import numpy as np
import requests
from requests.exceptions import RequestException
import pandas as pd
# ============= Fetch the web page =============
def craw(url, page):
    """Download one listing page and return its HTML as text.

    Args:
        url: Address of the page to fetch.
        page: Page number; used only in the failure message.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        fails (connection error, timeout, or a non-2xx HTTP status).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"
    }
    try:
        response = requests.request("GET", url, headers=headers, timeout=10)
        # Turn 4xx/5xx answers into HTTPError (a RequestException subclass)
        # so an error page is reported as a failure instead of being parsed.
        response.raise_for_status()
        # Force UTF-8 so .text decodes the page body correctly.
        response.encoding = 'utf-8'
        return response.text
    except RequestException:
        print('第{0}读取网页失败'.format(page))
        return None
# ========== Parse the page and save the data to a CSV table ==========
def _joined_text(node, selector, sep=' '):
    """Return the text of every element under *node* matching *selector*, joined by *sep*."""
    return sep.join(tag.get_text() for tag in node.select(selector))


def _house_row(house):
    """Extract the eight CSV fields of one listing element, in column order."""
    return [
        _joined_text(house, '.resblock-name a.name'),               # project name
        _joined_text(house, '.resblock-name span.resblock-type'),   # property type
        _joined_text(house, '.resblock-name span.sale-status'),     # sale status
        _joined_text(house, '.resblock-location span', sep=''),     # district (joined without spaces)
        _joined_text(house, '.resblock-location a'),                # detailed address
        _joined_text(house, '.resblock-tag span'),                  # tags (`advantage` in the original code)
        _joined_text(house, '.resblock-price .main-price .number'), # average price
        _joined_text(house, '.resblock-price .second'),             # total price text
    ]


def pase_page(url, page):
    """Fetch one listing page, extract every listing and append the rows to the CSV.

    Args:
        url: Address of the listing page.
        page: Page number; passed to ``craw`` and used in the status messages.

    Returns:
        None. Rows are appended to '贵阳房价.csv' and a status line is printed.
    """
    html = craw(url, page)
    # craw() returns None on failure. The previous code converted the result
    # with str() before this check, which made the failure branch unreachable.
    if html is None:
        print('解析失败')
        return

    soup = BeautifulSoup(html, 'lxml')
    # Each <li> under the list wrapper is one listing.
    houses = soup.select('.resblock-list-wrapper li')
    rows = [_house_row(house) for house in houses]

    if rows:
        # Column labels are kept as in the original; they are not written
        # because header=False.
        table = pd.DataFrame(
            rows,
            columns=['名称', '类型', '销售状态', '大地址', '具体地址', '劣势', '均价', '总价'],
        )
        # mode='a+' appends to the file. One append per page instead of one
        # per listing produces the same rows with far fewer file opens.
        # NOTE(review): this function is run from several threads and the
        # appends to the shared CSV are not locked.
        table.to_csv('贵阳房价.csv', mode='a+', index=False, header=False)
    print('第{0}页存储数据胜利'.format(page))
# ================== Two worker threads ==================
import threading

# Crawl pages 1..100, two pages at a time (one thread per page).
for i in range(1, 100, 2):
    url1 = "https://gy.fang.lianjia.com/loupan/pg" + str(i) + "/"
    url2 = "https://gy.fang.lianjia.com/loupan/pg" + str(i + 1) + "/"
    t1 = threading.Thread(target=pase_page, args=(url1, i))      # thread 1: odd page
    t2 = threading.Thread(target=pase_page, args=(url2, i + 1))  # thread 2: even page
    t1.start()
    t2.start()
    # Wait for both pages before starting the next pair. Without the joins
    # the loop would launch all 100 threads almost at once instead of
    # keeping two in flight.
    t1.join()
    t2.join()