Python爬虫实战一：爬取csdn学院所有课程名、价格和课时

2019年6月16日 358次阅读来源: yihan.z

作为第一个学习的爬虫小程序，之所以选取csdn学院，主要是因为该网站没有反爬措施或反爬措施较简单，不需要模拟浏览器和代理IP，也不需要验证和登录信息，对于新手而言非常友好；其次，需要爬取的内容都能在网页源码中直接找到。本篇文章使用urllib和正则表达式进行爬取。步骤一：分析网站。建议使用能查看网页源码的浏览器分析网站，找到课程名、价格和课时在源码中的位置。课程名所在位置：

<img src="https://img-bss.csdn.net/201708171721537407.gif" width="179" height="120" alt="让机器“看见”—计算机视觉原理及实战">

价格所在位置：

 <p class="clearfix">
                        <i>
                            ￥269.10                        </i>

课时所在位置：

<p><em>82</em>课时（<em>已更新至82</em>）<em>13</em>小时<em>07</em>分 </p>

步骤二：

编写正则表达式，将网页中课程名、价格和课时提取出来。

# Course titles: every course thumbnail is an <img> tag whose alt text is the
# title. Raw strings (r'...') are used for all patterns so that regex escapes
# such as \s are never interpreted as Python string escapes.
pat1 = r'<img src="(.*?)" width="179" height="120" alt="(.*?)">'
name = re.compile(pat1).findall(str(data))
# findall returns (src, alt) pairs. Passing them through a dict keeps one
# title per distinct image URL; list() makes the result indexable.
name = list(dict(name).values())

# Lesson counts: the <em> that directly follows <p> holds the number of lessons.
pat2 = r'<p><em>(.*?)</em>'
class_num = re.compile(pat2).findall(str(data))

# Prices: \s{1,} skips the newlines and indentation between the tags, so the
# page text does not need to be stripped of whitespace beforehand.
pat3 = r'<p class="clearfix">\s{1,}<i>\s{1,}(.*?)\s{1,}'
price = re.compile(pat3).findall(str(data))
# Reduce each captured price text to the list of numeric tokens it contains.
# Building a new list avoids the index() lookup per element and avoids
# mutating the list while it is being iterated.
number_pat = re.compile(r'-?\d+\.?\d*e?-?\d*?')
price = [number_pat.findall(i) for i in price]

步骤三：编写代码实现网页翻页。通过点击页面上的页码观察url的变化规律，从而构造每一页的url。本文url的构造规则为： https://edu.csdn.net/courses/p<页码> （例如第2页为 https://edu.csdn.net/courses/p2 ）。观察得到，课程列表共299页，因此编写一个循环语句，每循环一次请求一页，代码如下：

# Base URL of the course listing; page N is served at <base>/pN.
html = "https://edu.csdn.net/courses"
# The listing was observed to have 299 pages. range() excludes its upper
# bound, so the stop value must be 300 for page 299 to be requested.
for n in range(1, 300):
    url = html+'/p'+str(n)
    print(url)
    # The with-block closes the HTTP response as soon as the body is read,
    # instead of leaving one open connection per page.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8')  # page source decoded as UTF-8

步骤四：爬取下来的数据一般存入数据库或本地文档中，本文使用xlwt把爬取的数据存成Excel（.xls）文档保存到本地。代码如下：

#将爬取数据存为Excel（.xls）文件，保存在本地

def sava_data(name, class_num, price, path_prefix="E:/csdn学院课程汇总表"):
    """Write the scraped courses to a new .xls workbook and save it.

    Row 0 holds the column headers; each following row holds one course
    (running number, title, lesson count, price). One row is left empty
    after the data and the collection time is written below it.

    Args:
        name: Sequence of course titles.
        class_num: Sequence of lesson counts, aligned with ``name``.
        price: Sequence of prices, aligned with ``name``.
        path_prefix: Leading part of the output path. A
            ``YYYYmmddHHMMSS`` timestamp and ``.xls`` are appended to it,
            so every run produces a distinct file name.
    """
    # Create the workbook and its single sheet.
    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet('sheet1', cell_overwrite_ok=True)

    # One shared cell style: bold Times New Roman.
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = 'Times New Roman'
    font.bold = True
    style.font = font

    # Header row.
    for col, title in enumerate(("序号", "课程名", "课时", "价格")):
        sheet1.write(0, col, title, style)

    # zip() pairs the three sequences element by element and stops at the
    # shortest one, so each row gets exactly one value per column and lists
    # of unequal length cannot cause an IndexError.
    a = 0  # number of data rows written so far (stays 0 for empty input)
    for a, (course, lessons, cost) in enumerate(zip(name, class_num, price), start=1):
        sheet1.write(a, 0, a, style)        # running number
        sheet1.write(a, 1, course, style)   # course title
        sheet1.write(a, 2, lessons, style)  # lesson count
        sheet1.write(a, 3, cost, style)     # price

    # The timestamp is taken once, after the loop, so it is written exactly
    # once and is available for the file name even when there are no rows.
    now = datetime.datetime.now()
    sheet1.write(a+2, 1, "采集时间", style)
    sheet1.write(a+2, 2, now.strftime("%Y-%m-%d %H:%M:%S"), style)

    workbook.save(path_prefix + now.strftime("%Y%m%d%H%M%S") + ".xls")

    print("数据写入excel文件完毕！")

完整代码：

import urllib.request 
import re,xlwt,datetime

class csdn_spider():
    """Crawl the CSDN Academy course listing and export it to an .xls file.

    ``data()`` downloads the listing pages, ``parse_page()`` extracts the
    course titles, lesson counts and prices from one page, and
    ``sava_data()`` writes the collected values to a workbook.
    """

    # Compiled once for the whole class instead of once per page. Raw strings
    # keep regex escapes such as \s from being read as Python string escapes.
    _NAME_RE = re.compile(r'<img src="(.*?)" width="179" height="120" alt="(.*?)">')
    _CLASS_NUM_RE = re.compile(r'<p><em>(.*?)</em>')
    _PRICE_RE = re.compile(r'<p class="clearfix">\s{1,}<i>\s{1,}(.*?)\s{1,}')
    _NUMBER_RE = re.compile(r'-?\d+\.?\d*e?-?\d*?')

    def __init__(self):
        # Number of lesson-count entries collected by the last data() run.
        self.c = 0

    def sava_data(self, name, class_num, price, path_prefix="E:/csdn学院课程汇总表"):
        """Write the scraped courses to a new .xls workbook and save it.

        Row 0 holds the column headers; each following row holds one course
        (running number, title, lesson count, price). One row is left empty
        after the data and the collection time is written below it.

        Args:
            name: Sequence of course titles.
            class_num: Sequence of lesson counts, aligned with ``name``.
            price: Sequence of prices, aligned with ``name``.
            path_prefix: Leading part of the output path. A
                ``YYYYmmddHHMMSS`` timestamp and ``.xls`` are appended to it,
                so every run produces a distinct file name.
        """
        # Create the workbook and its single sheet.
        workbook = xlwt.Workbook()
        sheet1 = workbook.add_sheet('sheet1', cell_overwrite_ok=True)

        # One shared cell style: bold Times New Roman.
        style = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        style.font = font

        # Header row.
        for col, title in enumerate(("序号", "课程名", "课时", "价格")):
            sheet1.write(0, col, title, style)

        # zip() pairs the three sequences element by element and stops at the
        # shortest one: every complete row is written (including the last
        # one) and lists of unequal length cannot cause an IndexError.
        a = 0  # number of data rows written so far (stays 0 for empty input)
        for a, (course, lessons, cost) in enumerate(zip(name, class_num, price), start=1):
            sheet1.write(a, 0, a, style)        # running number
            sheet1.write(a, 1, course, style)   # course title
            sheet1.write(a, 2, lessons, style)  # lesson count
            sheet1.write(a, 3, cost, style)     # price

        # The timestamp is taken once, after the loop, so it is written
        # exactly once and is available for the file name even with no rows.
        now = datetime.datetime.now()
        sheet1.write(a+2, 1, "采集时间", style)
        sheet1.write(a+2, 2, now.strftime("%Y-%m-%d %H:%M:%S"), style)

        workbook.save(path_prefix + now.strftime("%Y%m%d%H%M%S") + ".xls")

        print("数据写入excel文件完毕！")

    @classmethod
    def parse_page(cls, page):
        """Extract course titles, lesson counts and prices from one page.

        Args:
            page: Source text of one listing page.

        Returns:
            A ``(names, class_nums, prices)`` tuple of three lists of strings.
            Each price is the concatenation of the numeric tokens found in
            the captured price text, or ``''`` when it contains none.
        """
        # findall yields (src, alt) pairs; going through a dict keeps one
        # title per distinct image URL.
        names = list(dict(cls._NAME_RE.findall(page)).values())
        class_nums = cls._CLASS_NUM_RE.findall(page)
        # Joining the numeric tokens gives each price as one plain string
        # rather than a nested list.
        prices = [
            ''.join(cls._NUMBER_RE.findall(raw))
            for raw in cls._PRICE_RE.findall(page)
        ]
        return names, class_nums, prices

    def data(self, first_page=1, last_page=299):
        """Download the listing pages, collect all courses and save them.

        Args:
            first_page: Number of the first listing page to request.
            last_page: Number of the last listing page to request
                (inclusive).
        """
        html = "https://edu.csdn.net/courses"
        name = []
        class_num = []
        price = []
        # range() excludes its upper bound, hence last_page + 1.
        for page_no in range(first_page, last_page + 1):
            url = html+'/p'+str(page_no)
            print(url)
            # The with-block closes the HTTP response once the body is read.
            with urllib.request.urlopen(url) as response:
                page = response.read().decode('utf-8')
            names, class_nums, prices = self.parse_page(page)
            name += names
            class_num += class_nums
            price += prices
        self.c = len(class_num)
        # The collected data is printed once here rather than after every
        # page, which would re-print the growing lists on each iteration.
        print(self.c, name, class_num, price)
        self.sava_data(name, class_num, price)
if __name__ == '__main__':
    # Script entry point: build the spider, then crawl and export in one go.
    saveinfo = csdn_spider()
    save_res = saveinfo.data()

老实讲，用urllib+正则编写爬虫确实比scrapy繁琐，且运行速度也较慢。该代码可以通过优化或使用多线程来提高运行速度，具体做法这里就不一一介绍了。这是小编的第一篇文章，语文表达能力有限，希望能帮到你们！

    原文作者：yihan.z
    原文地址: https://blog.csdn.net/qq_33361618/article/details/80766395
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。