9.6 Notes: storing Scrapy-crawled data in MySQL and MongoDB

Python version used: 2.7.12

I. MongoDB

A small example

1. Spider: dmoz_item.py


import scrapy
from dmoz.items import DmozItem

class DmozItemSpider(scrapy.Spider):

    name = "dmoz_item"
    #allowed_domains = ["dmoz.org"]
    start_urls = ['http://www.dmoz.org/Computers/Programming/Languages/Python/Books/']

    def parse(self, response):
        # note: this absolute XPath is tied to the dmoz.org page layout at the time of writing
        entries = response.xpath('/html/body/div[5]/div/section[3]/div/div/div/div[3]')
        for i in entries:
            item=DmozItem()
            item['link']=i.xpath('a/@href').extract()
            item['title']=i.xpath('a/div/text()').extract()
            item['desc']=i.xpath('div/text()').extract()
            yield item

2. Items: items.py

import scrapy

class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title=scrapy.Field()
    desc=scrapy.Field()
    link=scrapy.Field()

Now for the main course:

3. Settings: settings.py

ITEM_PIPELINES = {
   'dmoz.pipelines.DmozPipeline': 300,
}

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'spider1'
MONGODB_DOCNAME = 'book_item'

4. Finally, the pipeline: pipelines.py

Note: in this approach, settings is imported from scrapy.conf.

from scrapy.conf import settings
import pymongo

class DmozPipeline(object):
    def __init__(self):
        port = settings['MONGODB_PORT']
        host = settings['MONGODB_HOST']
        db_name = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        book_info = dict(item)
        self.post.insert(book_info)
        return item

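Note that scrapy.conf and pymongo's collection.insert() are both legacy APIs. A minimal alternative sketch (not from the original post, assuming a newer Scrapy and pymongo 3.x) reads the same MONGODB_* settings through from_crawler() and uses insert_one() instead:

import pymongo

class DmozPipeline(object):

    def __init__(self, host, port, db_name, doc_name):
        self.host = host
        self.port = port
        self.db_name = db_name
        self.doc_name = doc_name

    @classmethod
    def from_crawler(cls, crawler):
        # read the same MONGODB_* keys defined in settings.py above
        return cls(
            host=crawler.settings.get('MONGODB_HOST'),
            port=crawler.settings.getint('MONGODB_PORT'),
            db_name=crawler.settings.get('MONGODB_DBNAME'),
            doc_name=crawler.settings.get('MONGODB_DOCNAME'),
        )

    def open_spider(self, spider):
        # one client per crawl, opened when the spider starts
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        self.post = self.client[self.db_name][self.doc_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))
        return item
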
MongoDB, method 2 (this one was tested and works)

First, start the local MongoDB service:

sudo service mongodb start
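
To confirm the server is actually reachable before starting the crawl, a quick ping from pymongo works (a small sketch, assuming pymongo is installed and MongoDB is listening on the default port):

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
# forces a round trip to the server; raises an error if MongoDB is not reachable
print(client.admin.command('ping'))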

Then add the database configuration to settings.py:

MONGO_HOST = "127.0.0.1"  # host IP
MONGO_PORT = 27017  # port
MONGO_DB = "Spider"  # database name
MONGO_COLL = "heartsong"  # collection name
# MONGO_USER = "zhangsan"
# MONGO_PSW = "123456"

Then write pipelines.py:

# -*- coding: utf-8 -*-

import pymongo
from scrapy.conf import settings

class HeartsongPipeline(object):  # the class name must match the entry under ITEM_PIPELINES in settings.py
    def __init__(self):
        # connect to the database
        self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        # if the database requires authentication, uncomment the following line
        # self.client.admin.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])
        self.db = self.client[settings['MONGO_DB']]  # handle to the database
        self.coll = self.db[settings['MONGO_COLL']]  # handle to the collection

    def process_item(self, item, spider):
        postItem = dict(item)  # convert the item into a plain dict
        self.coll.insert(postItem)  # insert one record into the database
        return item  # returning the item lets Scrapy log it in the console; optional

One thing worth pointing out: unlike MySQL, MongoDB does not require you to define the table (collection) and its structure in advance. When we insert a document, if the collection does not exist yet, it is created automatically.

Once these changes are in place, the crawler is ready to run. On the command line, enter the familiar:

scrapy crawl heartsong
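
Even though we never created the heartsong collection, it now exists, because the pipeline's insert created it on first use. A quick check with pymongo (a sketch reusing the names from settings.py above and the older pymongo API used in the post):

import pymongo

coll = pymongo.MongoClient('127.0.0.1', 27017)['Spider']['heartsong']
print(coll.count())      # number of items the pipeline stored
print(coll.find_one())   # one sample document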

II. MySQL: a small example

1. Spider: xicidaili.py

# -*- coding: utf-8 -*-
import scrapy
from xiciip.items import XiciipItem

class XicidailiSpider(scrapy.Spider):
    name = "xicidaili"
    allowed_domains = ["xicidaili.com"]
    #start_urls = ['http://zhangjiakou.ganji.com']

    headers={
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
    }

    def start_requests(self):
        reqs=[]

        for i in range(1,3):
            req=scrapy.Request("http://www.xicidaili.com/nn/%s"%i,headers=self.headers)
            reqs.append(req)

        return reqs


    def parse(self, response):
        print ("hahahahahhahah"+response.url)

        pre_item=XiciipItem()
        # pre_item['url']=response.url
        # return pre_item
        ip_list=response.xpath('//table[@id="ip_list"]')

        trs=ip_list[0].xpath('tr')

        items=[]
        #### string(td[4]) extracts the concatenated text of the node
        for i in trs[1:]:
            pre_item=XiciipItem()
            pre_item["ip"]=i.xpath('td[2]/text()')[0].extract()
            pre_item["port"]=i.xpath('td[3]/text()')[0].extract()
            pre_item["position"]=i.xpath('string(td[4])')[0].extract().strip()
            pre_item["type"]=i.xpath('td[6]/text()')[0].extract()

            ##### regex extraction: \. matches a literal dot, \d matches a digit
            pre_item["speed"]=i.xpath('td[7]/div[@class="bar"]/@title').re('\d{0,}\.\d{0,}')[0]
            pre_item["last_check_time"]=i.xpath('td[9]/text()')[0].extract()
            items.append(pre_item)
        return items
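
The two inline comments above do the heavy lifting: string(td[4]) collapses the node's text, and the regex pulls the numeric speed out of the @title attribute. A standalone illustration of the regex step (a sketch; the sample title value is made up to resemble what xicidaili serves):

# -*- coding: utf-8 -*-
import re

title = u'0.123秒'  # hypothetical @title value of the speed bar
print(re.findall(r'\d{0,}\.\d{0,}', title)[0])  # -> 0.123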

2. Items: items.py

import scrapy


class XiciipItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    ip=scrapy.Field()
    port=scrapy.Field()
    position=scrapy.Field()
    type=scrapy.Field()
    speed=scrapy.Field()
    last_check_time=scrapy.Field()

3. The main course: settings.py

MYSQL_HOSTS = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '******'
#MYSQL_PORT = settings.MYSQL_PORT
MYSQL_DB='xiciip'
CHARSET='utf8'


ITEM_PIPELINES = {
   'xiciip.pipelines.XiciipPipeline': 300,
}
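
Unlike MongoDB, MySQL needs the xiciip database and the proxy table to exist before the pipeline runs. The post does not show the schema, but judging from the INSERT statement in pipelines.py below, a one-off setup script along these lines should work (a sketch; the column types are my assumption, and the xiciip database must already have been created):

import MySQLdb

# same connection parameters as in settings.py (password elided as in the post)
con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='******', db='xiciip', charset='utf8')
cur = con.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS proxy (
        id INT AUTO_INCREMENT PRIMARY KEY,
        ip VARCHAR(64),
        port VARCHAR(16),
        position VARCHAR(128),
        type VARCHAR(32),
        speed VARCHAR(32),
        last_check_time VARCHAR(64)
    ) DEFAULT CHARSET=utf8
""")
con.commit()
cur.close()
con.close()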

4. Pipelines: pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import MySQLdb

#### note: settings is again imported from scrapy.conf
from scrapy.conf import settings


class XiciipPipeline(object):
    def process_item(self, item, spider):

        # DBKWARGS=spider.settings.get('DBKWARGS')
        # con=MySQLdb.connect(**DBKWARGS)


        host = settings['MYSQL_HOSTS']
        user = settings['MYSQL_USER']
        psd = settings['MYSQL_PASSWORD']
        db = settings['MYSQL_DB']
        c=settings['CHARSET']
        # method 2 (used here): read the connection parameters from settings
        con = MySQLdb.connect(host=host,user=user,passwd=psd,db=db,charset=c)
        # method 1 (also works): hard-code the connection parameters
        #con = MySQLdb.connect(host='127.0.0.1',user='root',passwd='******',db='xiciip',charset='utf8')
        cur=con.cursor()
        sql=("insert into proxy(ip,port,position,type,speed,last_check_time) "
             "values(%s,%s,%s,%s,%s,%s)")
    #    sql=('insert into p1(url) values("%s")')
        #sql="insert into p1 values (%s)"
        #list=(item['url'].split(':')[0])
        #list=[item['url']]
        #print('wwwwwwwwwwwwwwww',list,type(list),type('h'))
        values = [item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time']]

        try:
            cur.execute(sql, values)
        except Exception as e:
            print('Insert error', e)
            con.rollback()

        else:
            con.commit()

        cur.close()
        con.close()

        return item
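
One caveat about this pipeline: it opens and closes a MySQL connection for every single item, which gets expensive on larger crawls. A leaner variant (a sketch, not from the original post) opens the connection once per spider using open_spider/close_spider:

import MySQLdb
from scrapy.conf import settings


class XiciipPipeline(object):

    def open_spider(self, spider):
        # one connection for the whole crawl instead of one per item
        self.con = MySQLdb.connect(
            host=settings['MYSQL_HOSTS'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            db=settings['MYSQL_DB'],
            charset=settings['CHARSET'],
        )
        self.cur = self.con.cursor()

    def close_spider(self, spider):
        self.cur.close()
        self.con.close()

    def process_item(self, item, spider):
        sql = ("insert into proxy(ip,port,position,type,speed,last_check_time) "
               "values(%s,%s,%s,%s,%s,%s)")
        values = [item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time']]
        try:
            self.cur.execute(sql, values)
            self.con.commit()
        except Exception as e:
            print('Insert error', e)
            self.con.rollback()
        return item
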
Original author: 怂恿的大脑
Original article: https://www.jianshu.com/p/3d461f9d0903