I had previously been crawling outbound-investment data from tyc (tianyancha) with Selenium and PhantomJS in a single thread, but it was painfully slow: upwards of 30-60s per company. That wasn't even the worst part. What really broke me was that the crawler would fail shortly after starting: the program would simply get stuck at some point and stop, and inspection never turned up where the bug was, so crawling became an endless cycle of start manually, get interrupted, resume manually. Having just learned Scrapy, I decided to redo it with Scrapy + Selenium + PhantomJS.
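In hindsight, hangs like that are often a page load that never returns, and Selenium can put a bound on those. A minimal sketch of the guards that could have been added to the old single-threaded crawler (the 30s values are arbitrary assumptions):

```python
from selenium import webdriver
from selenium.common.exceptions import TimeoutException

browser = webdriver.PhantomJS()
browser.set_page_load_timeout(30)  # give up on page loads after 30s
browser.set_script_timeout(30)     # bound async script execution too

try:
    browser.get('http://www.tianyancha.com/')
except TimeoutException:
    # Instead of hanging forever, we now get an exception we can log and retry
    print('page load timed out; log it, then retry or skip')
finally:
    browser.quit()
```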
The code first:
```python
# coding: utf-8
import time

import scrapy
import xlrd
from bs4 import BeautifulSoup
from scrapy.http import Request
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

from tyc.items import TycItem


class TycSpider(scrapy.Spider):
    name = 'tyc'
    allowed_domains = ['tianyancha.com']

    # Company names to search for, read from the first column of an Excel sheet
    fname = "C:\\Users\\Administrator\\Desktop\\test.xlsx"
    workbook = xlrd.open_workbook(fname)
    sheet = workbook.sheet_by_name('Sheet1')
    cols = sheet.col_values(0)

    # URLs to crawl: one search page per company name
    start_urls = ['http://www.tianyancha.com/search?key={}&checkFrom=searchBox'.format(col)
                  for col in cols]

    def get_browser(self):
        # Emulate a browser with PhantomJS; a user agent still has to be set,
        # otherwise the returned page source is incomplete
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Mobile Safari/537.36"
        )
        return webdriver.PhantomJS(desired_capabilities=dcap)

    def parse(self, response):
        # Search results page: grab the URL of the company's detail page
        browser = self.get_browser()
        url = None
        try:
            browser.get(response.url)
            time.sleep(4)
            url = browser.find_element_by_class_name('query_name').get_attribute('href')
        except Exception:
            self.logger.info('No company found for this query!')
        finally:
            # Always quit, even on failure, so PhantomJS processes do not pile up
            browser.quit()
        if url:
            self.logger.info('Search succeeded: %s', url)
            yield Request(url=url, callback=self.parse_detail)

    def parse_detail(self, response):
        # Detail page: extract the company's outbound investments
        browser = self.get_browser()
        self.logger.info('url %s', response.url)
        browser.get(response.url)
        time.sleep(3)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        browser.quit()

        name = soup.select('.base-company')[0].text.split(' ')[0]
        self.logger.info('Company name: %s', name)
        try:
            rows = soup.select('#nav-main-outInvestment .m-plele')
            self.logger.info('Found %d outbound investments', len(rows))
            for row in rows:
                cells = row.select('div')
                # A fresh item per row; reusing one instance would make every
                # yielded item point at the same mutated object
                item = TycItem()
                item['company'] = name
                item['enterprise_name'] = cells[0].text
                item['legal_person_name'] = cells[2].text
                item['industry'] = cells[3].text
                item['status'] = cells[4].text
                item['reg_captial'] = cells[5].text
                yield item
        except Exception:
            self.logger.info('This company has no outbound investments!')
```
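For reference, the TycItem imported from tyc.items has to declare the six fields assigned above. A minimal version consistent with the spider looks like this (reg_captial keeps the spider's spelling so the two files agree):

```python
# tyc/items.py -- minimal sketch consistent with the spider above
import scrapy


class TycItem(scrapy.Item):
    company = scrapy.Field()            # the company whose investments we searched
    enterprise_name = scrapy.Field()    # name of the invested enterprise
    legal_person_name = scrapy.Field()  # legal representative
    industry = scrapy.Field()           # industry classification
    status = scrapy.Field()             # operating status
    reg_captial = scrapy.Field()        # registered capital
```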
A few things worth noting:
- Even though Selenium is emulating a real browser, you still have to set the user agent in the headers; without it the page source comes back incomplete.
- Speed is noticeably better now, but for truly large volumes of data the answer is still distributed crawling with scrapy-redis, or deploying with scrapyd; see the settings sketch below.
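As a sketch of that direction: converting this spider to scrapy-redis is mostly a matter of a few settings, something like the following in settings.py (assuming a Redis instance at the default local address):

```python
# settings.py -- minimal scrapy-redis setup (sketch; assumes a local Redis
# instance at the default address)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # share the request queue via Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # dedupe requests across workers
SCHEDULER_PERSIST = True                                    # keep the queue between runs
REDIS_URL = 'redis://localhost:6379'                        # assumption: local default Redis
```

Each machine then runs the same spider and pulls requests from the shared queue, so adding workers scales the crawl horizontally.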