相关库
import codecs
import random
import time

import pymysql
import pymysql.cursors
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
从数据库中读取车型(车型已经存放在数据库中,这里读取车型的id,拼接到url上)
# Rows of (car_id, car_name) read from data_etl.user_car_port. Each row is a
# dict because the connection is opened with DictCursor.
cars = []

# NOTE(review): the asterisk strings look like redacted connection settings;
# substitute real values before running.
conn = pymysql.connect(
    host='*******',
    charset='utf8',
    user='*******',
    passwd='*****',
    db='mysql',
    cursorclass=pymysql.cursors.DictCursor,
)
try:
    # The cursor is used as a context manager so it is closed even when one
    # of the queries raises.
    with conn.cursor() as cur:
        cur.execute("USE data_etl")
        cur.execute("select distinct(car_id),car_name from user_car_port")
        cars.extend(cur.fetchall())
    count = len(cars)
    print(count)
finally:
    conn.close()
由于汽车之家反爬比较复杂,我们直接调用浏览器接口
# Location of the ChromeDriver executable handed to Selenium.
CHROMEDRIVER_PATH = 'chromedriver.exe'

# Browser session used by getCarPrice() to load every model page.
driver = webdriver.Chrome(CHROMEDRIVER_PATH)
def getCarPriceOffSale(innerHtml):
    """Extract the price range from the price panel of a discontinued model.

    The price is read from the first ``<span class="price">`` element and the
    ``<strong class="red">`` inside it. The text is split on ``-`` and the
    "万" / "元" unit characters are removed before converting to float.

    Args:
        innerHtml: HTML fragment (string) of the price container.

    Returns:
        A ``(button, top)`` tuple holding the lower and upper bound. When the
        text has no ``-`` separator both values are equal. ``(0.0, 0.0)`` is
        returned when no price could be extracted; a message explaining why
        is printed in that case.
    """
    button = 0.0
    top = 0.0
    print("此车型已经停售!")
    try:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        bsObj = BeautifulSoup(innerHtml, "html.parser")

        # find() returns None when nothing matches, which keeps the
        # "element missing" messages below reachable.
        spanPrice = bsObj.find("span", {"class": "price"})
        if spanPrice is None:
            print("价格span为空")
            return button, top

        strongPrice = spanPrice.find("strong", {"class": "red"})
        if strongPrice is None:
            print("价格strong为空")
            return button, top

        text = strongPrice.text.strip()
        if not text:
            print("价格字段为空")
            return button, top

        prices = [part.replace("万", "").replace("元", "") for part in text.split("-")]
        # Convert both bounds before assigning, so a malformed upper bound
        # cannot leave a half-filled result behind.
        low = float(prices[0])
        high = float(prices[1]) if len(prices) == 2 else low
        button, top = low, high
    except Exception:
        # Deliberate best-effort: one unparsable page must not stop the crawl.
        print("程序出错!停售车型")
    return button, top
处理在售车型的价格信息
def getCarPriceOnSale(innerHtml):
    """Extract the price range from the price panel of a model still on sale.

    The price is read from the ``<a class="emphasis">`` element inside the
    first ``<dd>`` of the fragment. The text is split on ``-`` and the
    "万" / "元" unit characters are removed before converting to float.

    Args:
        innerHtml: HTML fragment (string) of the price container.

    Returns:
        A ``(button, top)`` tuple holding the lower and upper bound. When the
        text has no ``-`` separator both values are equal. ``(0.0, 0.0)`` is
        returned when no price could be extracted; a message explaining why
        is printed in that case.
    """
    button = 0.0
    top = 0.0
    print("此车型在售")
    try:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        bsObj = BeautifulSoup(innerHtml, "html.parser")

        # find() returns None instead of raising when there is no <dd>, so a
        # missing container is reported the same way as a missing price link.
        ddprice = bsObj.find("dd")
        a = ddprice.find("a", {"class": "emphasis"}) if ddprice is not None else None
        if a is None:
            print("此车型暂时无法查询价格")
            return button, top

        prices = [part.replace("万", "").replace("元", "") for part in a.text.split("-")]
        # Convert both bounds before assigning, so a malformed upper bound
        # cannot leave a half-filled result behind.
        low = float(prices[0])
        high = float(prices[1]) if len(prices) == 2 else low
        button, top = low, high
    except Exception:
        # Deliberate best-effort: one unparsable page must not stop the crawl.
        print("程序出错!在售车型")
    return button, top
根据车型id打开页面,判断车型在售还是停售,并获取对应的价格信息
def getCarPrice(carId):
    """Load the page of one model and return its price range.

    The page at ``url + str(carId)`` is opened in the shared ``driver``.
    If an element with class ``information-summary`` appears within 5
    seconds, the page is treated as an on-sale model and the
    ``information-price`` element is parsed with getCarPriceOnSale().
    Otherwise the function waits up to 5 more seconds for a ``car_price``
    element and parses it with getCarPriceOffSale().

    NOTE(review): ``url`` is a module-level name that is not defined in the
    visible part of this file; it must exist before this function is called.

    Args:
        carId: Model id appended to ``url``.

    Returns:
        A ``(button, top)`` tuple with the lower and upper price bound, or
        ``(0.0, 0.0)`` when neither page layout was found.
    """
    button = 0.0
    top = 0.0
    try:
        driver.get(url + str(carId))
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "information-summary"))
        )
        # find_element(By...) is used because the find_element_by_* helpers
        # are not available in current Selenium releases.
        ele = driver.find_element(By.CLASS_NAME, "information-price").get_attribute('innerHTML')
        button, top = getCarPriceOnSale(ele)
    except TimeoutException:
        # No on-sale layout within the timeout: try the discontinued layout.
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "car_price"))
            )
            ele = driver.find_element(By.CLASS_NAME, "car_price").get_attribute('innerHTML')
            button, top = getCarPriceOffSale(ele)
        except (TimeoutException, NoSuchElementException):
            print("此车型有问题:" + str(carId))
    except NoSuchElementException:
        # Summary block present but no price element: report the model
        # instead of letting the exception abort the whole crawl.
        print("此车型有问题:" + str(carId))
    return button, top
遍历数据库所有车型的id
# Look up the price range of every model and store it on the row dict.
for car in cars:
    # Named car_id rather than id so the builtin id() is not shadowed for
    # the rest of the module.
    car_id = car["car_id"]

    # Random pause of 1-5 seconds between page loads.
    time.sleep(random.randint(1, 5))

    button, top = getCarPrice(car_id)
    if button == 0.0 and top == 0.0:
        # Both bounds still at their initial 0.0 means no price was
        # extracted; 9999 is stored as the placeholder for that case.
        car["button"] = 9999
        car["top"] = 9999
    else:
        car["button"] = button
        car["top"] = top