Python 3: crawling company-internal PPT slides

While writing the crawler I ran into the following error:

WinError 10061 - No connection could be made

The fix:

 1. Open IE Internet Options
 2. Connections -> LAN Settings
 3. Check "Automatically detect settings" (a code-level alternative is sketched below)
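
If changing the IE settings is not an option, the proxy can also be configured directly in the crawler with urllib.request.ProxyHandler. A minimal sketch, assuming a hypothetical proxy address proxy.example.com:8080 (not from the original post):

import urllib.request

# hypothetical corporate proxy address; replace with the real one
proxy = urllib.request.ProxyHandler({
    'http': 'http://proxy.example.com:8080',
    'https': 'http://proxy.example.com:8080',
})
opener = urllib.request.build_opener(proxy)
urllib.request.install_opener(opener)  # later urlopen() calls go through the proxy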

Below is a small wrapper class for the DB operations:

# -*- coding:utf-8 -*-
#__author__ = 'ecaoyng'

import pymysql
import time

class DBOperation:

    def __init__(self, tb_name):
        self.db_host = 'x'
        self.db_port = 3306
        self.db_user = 'x'
        self.db_pwd = 'x'
        self.db_name = 'x'
        self.tb_name = tb_name

    def get_time(self):
        now_time=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return now_time
    ''' set up connection with db '''
    def db_conn(self):
        exec_time = self.get_time()
        try:
            conn = pymysql.connect(host=self.db_host,port=self.db_port,
                                   user=self.db_user,passwd=self.db_pwd,db=self.db_name)
            return conn
        except Exception as e:
            print(u'[%s]: Errors during db connection:%s' % (exec_time, e))
            return None
    ''' set up cursor '''
    def db_cursor(self, conn):
        try:
            cur = conn.cursor()
            return cur
        except Exception as e:
            print(e)
            return None

    ''' db close '''
    def db_close(self,cur,conn):
        exec_time = self.get_time()
        cur.close()
        conn.close()
        print(u'[%s]: db closed' % exec_time)

    ''' db operations '''
    def tb_insert_url(self,cur,conn,urls):
        exec_time = self.get_time()
        tb_exist_sql = """CREATE TABLE IF NOT EXISTS """+ self.tb_name + """ ( URL VARCHAR(200) NOT NULL )"""
        try:
            cur.execute(tb_exist_sql)
            print(u'[%s]: try to create table %s if not exists.' % (exec_time, self.tb_name))
            conn.commit()

            sql_insert_url = 'INSERT INTO ' + self.tb_name + ' VALUES (%s)'
            # executemany expects one parameter tuple per row
            cur.executemany(sql_insert_url, [(url,) for url in urls])
            conn.commit()
        except Exception as e:
            print(u'[%s]: Errors during insert into %s:%s' % (exec_time, self.tb_name ,e))


if __name__ == '__main__':

    db=DBOperation('ECNSlides')
    db_conn = db.db_conn()
    db_cur = db.db_cursor(db_conn)
    db.db_close(db_cur,db_conn)
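
A quick usage sketch of tb_insert_url, assuming the 'x' placeholder credentials above have been filled in (the sample URL is a hypothetical placeholder):

db = DBOperation('ECNSlides')
conn = db.db_conn()
cur = db.db_cursor(conn)
# creates the table on first use, then inserts the URL rows
db.tb_insert_url(cur, conn, ['https://example.com/slides/1'])
db.db_close(cur, conn)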

Below is the crawler itself:

# -*- coding:utf-8 -*-
#__author__ = 'ecaoyng'

from ESlides.src.DBOperation import *
import urllib.request
import re
import time


class ESlidesCrawler:
    def __init__(self):
        self.target_link='https://mediabank.ericsson.net/search/slides/group%20function%20%28gf%29'
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        self.user_headers = {
            'User-Agent': self.user_agent,
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'Accept-Encoding' is intentionally left out: urllib.request does not
            # decompress gzip/deflate/br bodies, so we want plain responses
            'Accept-Language' : 'zh-CN,zh;q=0.8',
            'Cookie' : 'PHPSESSID=57i0onm69eei46g6g23ek05tj2',
            'Host' : 'mediabank.ericsson.net',
            'Referer' : 'https://mediabank.ericsson.net/'

        }
        self.save_dir = 'C:/Users/ecaoyng/Desktop/PPT/'

    ''' get local time '''
    def get_time(self):
        now_time=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return now_time
    ''' get page links '''
    def get_page(self):
        now_time=self.get_time()
        try:
            request = urllib.request.Request(self.target_link, headers=self.user_headers)
            response = urllib.request.urlopen(request)
            pageCode = response.read().decode('utf-8')
            return  pageCode
        except urllib.request.URLError as e:
            print(u'%s Errors during connect to target link:%s' % (now_time, e))
            return None
    ''' get initial target links '''
    def get_links(self):
        now_time = self.get_time()
        page_code = self.get_page()
        if page_code is not None:
            page_links = []
            try:
                pattern = re.compile(
                    '<li id=.*?>.*?<a href="/media/(.*?)" class="thumb" draggable="true">',re.S)
                items = re.findall(pattern, page_code)
                for item in items:
                    item = '%s%s%s' % ('https://mediabank.ericsson.net/details/', item, '/download/original')
                    page_links.append(item)
                return page_links
            except Exception as e:
                print(u'[%s]: Errors during parser target link:%s' % (now_time, e))
                return None
        else:
            print('page code returns none')
            return None
    ''' save links into database '''
    def save_links(self):
        now_time = self.get_time()
        links=self.get_links()
        print(links)
        try:
            if links is not None:
                db = DBOperation('ECNSlides')
                db_conn = db.db_conn()
                db_cur = db.db_cursor(db_conn)
                print(u'[%s]: start to urls insert to db' % now_time)
                db.tb_insert_url(db_cur, db_conn, links)
                print(u'[%s]: write urls insert to db successfully' % now_time)
            else:
                print(u'[%s]: URL is None when insert to db' % now_time)
                pass
        finally:
            db.db_close(db_cur, db_conn)

    ''' download ECN slides with params by http '''
    def slides_download_params(self):
        links = self.get_links()
        if links is None:
            print('no links to download, get_links() returned None')
            return
        try:
            for url in links:
                now_time = self.get_time()
                # the numeric media id before /download/original becomes the file name
                file_pattern = re.compile(r'.*?/(\d+)/download/original$', re.S)
                file_name = re.findall(file_pattern, url)
                file_path = self.save_dir + ''.join(file_name) + '.pptx'

                print('Downloading to %s ...' % file_path)

                # context managers close both handles even on error
                with urllib.request.urlopen(url) as slide:
                    with open(file_path, 'wb') as outfile:
                        outfile.write(slide.read())
        except Exception as e:
            print(u'[%s]: Errors during download slides: %s.' % (now_time, e))

    ''' download ECN slides with remote db '''
    def slides_download_db(self):
        pass

if __name__ == '__main__':
    crawler=ESlidesCrawler()
    # crawler.save_links()
    crawler.slides_download_params()

Then a problem appeared. Typing a download address into the browser, something like

https://mediabank.ericsson.net/details/Organization%20simple/83138/download/original

works fine, but fetching the same address from the Python code returns an HTML file rather than a .pptx file.
To find out what kind of file is actually returned, inspect the response object:

reobj = urllib.request.urlopen(url)
print(type(reobj))      # <class 'http.client.HTTPResponse'>
print(reobj.info())     # response headers, including Content-Type
print(reobj.getcode())  # HTTP status code

When a download works normally (here a zip file), the returned headers look like this:

Content-Type: application/x-zip-compressed
Last-Modified: Mon, 23 May 2016 07:50:56 GMT
Accept-Ranges: bytes
ETag: "0f075d6c7b4d11:0"
Server: Microsoft-IIS/7.5
X-Powered-By: ASP.NET
Date: Wed, 29 Nov 2017 07:07:27 GMT
Connection: close
Content-Length: 55712699

But for what should have been a PPT file, this was downloaded instead:

Cache-Control: no-cache
Pragma: no-cache
Content-Length: 11743
Content-Type: text/html
Expires: Wed, 29 Nov 2017 07:04:04 GMT
Server: Microsoft-IIS/8.0
Set-Cookie: SMTargetSession=HTTPS%3A%2F%2Ffss%2Eericsson%2Ecom%2Fsiteminderagent%2Fredirectjsp%2Fredirect%2Dinternal%2Ejsp%3FSPID%3DMediabankIntern%26RelayState%3Dhttps%253A%252F%252Fmediabank%2Eericsson%2Enet%252Fdetails%252FOrganization%252520simple%252F83138%252Fdownload%252Foriginal%26SMPORTALURL%3Dhttps%253A%252F%252Ffss%2Eericsson%2Ecom%252Faffwebservices%252Fpublic%252Fsaml2sso%26SAMLTRANSACTIONID%3D176beb36%2Dfeb953b6%2D9a53d42e%2D58810506%2D087b72ac%2Da4e3; path=/ Set-Cookie: ASPSESSIONIDACATSTTS=FOLBNEGCIBMFCPILNEMHOHFN; path=/
X-Powered-By: ASP.NET
X-WAM-LOC: LP2-2
Date: Wed, 29 Nov 2017 07:05:04 GMT
Connection: close
Set-Cookie: BIGipServerWAM_PRD_Login=rd423o00000000000000000000ffff9958f466o50001; path=/

Content-Type: text/html shows that the response is an HTML file. Opening it reveals the company's security-authentication (login) page.
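
Given that, a defensive check before saving suggests itself: skip the write whenever the Content-Type is text/html. A minimal sketch (the helper name fetch_if_not_html is mine, not from the original code):

import urllib.request

def fetch_if_not_html(url, file_path):
    # download url to file_path, but skip HTML responses such as the login page
    resp = urllib.request.urlopen(url)
    content_type = resp.info().get('Content-Type', '')
    if content_type.startswith('text/html'):
        print('got HTML (probably the auth page), skipping %s' % url)
        return False
    with open(file_path, 'wb') as outfile:
        outfile.write(resp.read())
    return True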

So the next idea is to try crawling with the session cookies attached.
(To be continued)
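
A minimal sketch of that cookie approach, reusing the PHPSESSID captured from an authenticated browser session (whether this passes the authentication check is exactly what remains to be tested):

import urllib.request

opener = urllib.request.build_opener()
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0'),
    # session cookie copied from a logged-in browser session
    ('Cookie', 'PHPSESSID=57i0onm69eei46g6g23ek05tj2'),
]
url = 'https://mediabank.ericsson.net/details/Organization%20simple/83138/download/original'
with opener.open(url) as resp:
    print(resp.info().get('Content-Type'))  # should no longer be text/html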
