本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理
以下文章来源于腾讯云 作者:昱良
1、selenium爬虫,绕过企查查的登录验证,但账号和IP限制太大,最终放弃
2、通过requests直接请求+cookies,遇到了cookie有效期和限制问题
不断地尝试和修改参数,最终发现一种有效方式:selenium + wap(通过移动端页面抓取)
只需要IP代理,不需要账号,也没有访问频率上的限制;但因为没有登录,拿到的信息有限,只能获取未登录状态下页面上能展示的内容。
image
一、初始化selenium
# Pick the bundled chromedriver binary that matches the host platform.
sysstr = platform.system()
if sysstr == "Windows":
    # NOTE(review): the published snippet read
    # os.getcwd() + "utoolschromedriver.exe", i.e. without any path
    # separators. Assumed to mean <cwd>\utools\chromedriver.exe -- confirm
    # against the real directory layout.
    chromedriver_path = os.path.join(os.getcwd(), "utools", "chromedriver.exe")
else:  # mac
    chromedriver_path = os.path.join(os.getcwd(), "mac_chromedriver")
logger.info("chromedriver_path: %s", chromedriver_path)

# Chrome command-line switch carrying the default desktop user agent.
# The inner double quotes are part of the switch value, so the literal is
# single-quoted on the outside.
default_agent = (
    '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/65.0.3325.146 Safari/537.36"'
)


class wap_QiChaCha(object):
    """Headless-Chrome client for the mobile (wap) Qichacha pages.

    ``__init__`` only builds the Chrome options; call ``init()`` to actually
    start the browser before using the instance.
    """

    def __init__(self, user_agent_header=default_agent,
                 chromedriver_path=chromedriver_path,
                 proxy_server="http://47.98.154.206:3008"):
        """Prepare the Chrome options used by ``init()``.

        Args:
            user_agent_header: complete ``--user-agent=...`` switch passed to
                Chrome.
            chromedriver_path: path of the chromedriver executable that
                ``init()`` launches.
            proxy_server: proxy URL for ``--proxy-server``; a falsy value
                disables the proxy switch.
        """
        # Remember the driver path so init() honours the caller's choice
        # instead of silently falling back to the module-level default.
        self.chromedriver_path = chromedriver_path
        self.driver = None
        self.error_encounter = 0

        self.options = webdriver.ChromeOptions()  # browser configuration object
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        if proxy_server:
            self.options.add_argument("--proxy-server=%s" % proxy_server)
        self.options.add_argument("--headless")  # run without a visible window
        # Optional: disabling the GPU works around some unexplained problems.
        self.options.add_argument("--disable-gpu")
        self.options.add_argument(user_agent_header)

        # Emulate a phone so that the mobile pages are rendered.
        mobile_emulation = {"deviceName": "iPhone X"}
        self.options.add_experimental_option("mobileEmulation", mobile_emulation)

    def init(self):
        """Start (or restart) Chrome, load the home page, reset the error count."""
        self.driver = webdriver.Chrome(
            executable_path=self.chromedriver_path,
            chrome_options=self.options,
        )
        # Load the mobile home page.
        self.driver.get("https://m.qichacha.com/")
        self.error_encounter = 0
www#gaodaima.com来源gaodaimacom搞#^代%!码&网搞代码
二、判断公司存在
def search_company(self, company_name):
    """Search the mobile site for ``company_name`` and scrape the first hit.

    The first search result is accepted when its displayed name, or the name
    listed in the line underneath it (former name etc.), equals the requested
    name after ``utils.normalizeCompanyName``.

    Args:
        company_name: company name to look up.

    Returns:
        Whatever ``self.company_detail`` returns for the matched company, or
        ``None`` when the page reports "0", when the first hit does not match,
        or when any exception occurs.

    Side effects:
        Every failure increments ``self.error_encounter``; on the third
        consecutive failure the browser is quit and restarted via
        ``self.init()``.
    """
    from urllib.parse import quote

    try:
        target_name = utils.normalizeCompanyName(company_name)
        result = {COMPANY.NAME: target_name}
        logger.info("search for: %s", company_name)

        # Request the search-result URL directly (an earlier version typed the
        # name into the home-page search box and clicked the button).
        # The name is percent-encoded so characters such as '&' or '#' cannot
        # cut the query string short.
        self.driver.get(
            "https://m.qcc.com/search?key=%s" % quote(str(company_name)))
        # The trailing arguments are handed to utils.alertWait unchanged;
        # their meaning is defined by that helper.
        utils.alertWait(
            WebDriverWait(self.driver, 3).until,
            expected_conditions.presence_of_element_located(
                (By.XPATH,
                 '//*[contains(@class,"text-danger") or contains(@class,"nodata")]')),
            5, 0, "not found text-danger or nodata")

        # Check whether the company exists. The text of the "text-danger"
        # element is presumably the number of hits -- "0" means nothing found.
        # NOTE(review): a page that shows only the "nodata" element makes this
        # lookup raise, which is counted as an error below -- confirm intent.
        inc_full = self.driver.find_element_by_xpath(
            '//*[@class="text-danger"]').text
        self.error_encounter = 0
        if inc_full == "0":
            logger.error("company %s not found", company_name)
            return None

        # Name and detail link of the first result.
        cname = self.driver.find_element_by_xpath(
            '//div[@class="list-item-name"]').text
        href = self.driver.find_element_by_xpath(
            '//a[@class="a-decoration"]').get_attribute("href")

        # Former name: the line below the result is labelled differently
        # (former name, historical shareholder, ...).
        cym = None
        try:
            stock_or_others = self.driver.find_element_by_xpath(
                '//div[@class="list-item-bottom"]').text
        except Exception:
            # Best effort: the bottom line could not be read.
            stock_or_others = ""
        if target_name in stock_or_others:
            # The published snippet replaced ":" with ":" (a no-op);
            # presumably the first colon was the full-width one -- confirm.
            parts = stock_or_others.replace("\uff1a", ":").split(":")
            if len(parts) > 1:
                cym = parts[1]

        matches_name = utils.normalizeCompanyName(cname) == target_name
        # Only normalise the former name when there is one, so None is never
        # handed to the helper.
        matches_former = (
            cym is not None
            and utils.normalizeCompanyName(cym) == target_name
        )
        if matches_name or matches_former:
            result[COMPANY.URL] = href
            return self.company_detail(href, result)
        return None
    except Exception as err:
        logger.error(err)
        self.error_encounter = self.error_encounter + 1
        if self.error_encounter >= 3:
            # Too many consecutive failures: start over with a fresh browser.
            self.driver.quit()
            self.init()
        return None
image
三、获取公司信息
def company_detail(self, href, result):
    """Open a company detail page and copy its fields into ``result``.

    Args:
        href: URL of the company detail page.
        result: dict that is filled in place and also returned.

    Returns:
        ``result``. Phone, e-mail and address are added when their elements
        can be read. If the basic-information table is missing, or the
        tax-credit section cannot be read, ``result`` is returned early
        without the business-registration text.
    """
    self.driver.get(href)
    # The trailing arguments are handed to utils.alertWait unchanged; their
    # meaning is defined by that helper.
    utils.alertWait(
        WebDriverWait(self.driver, 3).until,
        expected_conditions.presence_of_element_located(
            (By.XPATH, '//*[@class="company-name"]')),
        5, 0, "not found company-name")

    # Optional contact fields, addressed by absolute position in the page.
    contact_fields = (
        (COMPANY.TEL, "/html/body/div[1]/div[2]/div[1]/div[3]/a[1]"),
        (COMPANY.EMAIL, "/html/body/div[1]/div[2]/div[1]/div[3]/a[2]"),
        (COMPANY.ADDRESS, "/html/body/div[1]/div[2]/div[1]/div[4]"),
    )
    for key, xpath in contact_fields:
        try:
            text = self.driver.find_element_by_xpath(xpath).text
        except Exception:
            # Best effort: the page simply does not show this field.
            continue
        if text:
            result[key] = text.strip()

    try:
        infos = self.driver.find_element_by_xpath(
            '//div[@class="basic-wrap"]/table')
    except Exception:
        return result

    result[COMPANY.TAX_LEVEL] = "税务等级&&"
    try:
        taxcreditlist = self.driver.find_element_by_xpath(
            '//div[@id="taxcreditlist"]').text
    except Exception:
        # NOTE(review): returning here skips the business-information table
        # located above whenever the tax-credit section is absent. Kept as
        # published -- confirm that this is intended.
        return result
    result[COMPANY.TAX_LEVEL] += str(taxcreditlist).replace(" ", "&").strip()

    # Parse the table markup with etree and walk it row by row.
    data = etree.HTML(infos.get_property("innerHTML"))
    result[COMPANY.BUSINESS] = "工商信息"
    for row in data.xpath(".//tr"):
        cells = [
            str(text).replace(" ", "").strip()
            for text in row.xpath(".//td//text()")
        ]
        cells = [cell for cell in cells if cell]
        self.retrieveInfo(cells, result)
        # Append this row's cell texts, separated by " && ".
        result[COMPANY.BUSINESS] += " && " + " && ".join(cells)
    return result