Python version used: 2.7.12
I. MongoDB
A small example
1. spider: dmoz_item.py
import scrapy
from dmoz.items import DmozItem

class DmozItemSpider(scrapy.Spider):
    name = "dmoz_item"
    # allowed_domains = ["dmoz.org"]
    start_urls = ['http://www.dmoz.org/Computers/Programming/Languages/Python/Books/']

    def parse(self, response):
        # Each book entry lives under this container div
        book_list = response.xpath('/html/body/div[5]/div/section[3]/div/div/div/div[3]')
        for i in book_list:
            item = DmozItem()
            item['link'] = i.xpath('a/@href').extract()
            item['title'] = i.xpath('a/div/text()').extract()
            item['desc'] = i.xpath('div/text()').extract()
            yield item
2. items: items.py
import scrapy

class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    desc = scrapy.Field()
    link = scrapy.Field()
Now the main course:
3. settings: settings.py
ITEM_PIPELINES = {
    'dmoz.pipelines.DmozPipeline': 300,
}
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'spider1'
MONGODB_DOCNAME = 'book_item'
4. Finally, the pipeline: pipelines.py
Note: with this approach, settings is imported from scrapy.conf.
import pymongo
from scrapy.conf import settings

class DmozPipeline(object):
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        # Connect once and keep a handle to the target collection
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        book_info = dict(item)
        self.post.insert(book_info)
        return item
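As noted above, this version imports settings from scrapy.conf, which newer Scrapy releases deprecate. A minimal sketch of the same pipeline reading the same setting keys through from_crawler instead (behaviour otherwise unchanged; assumes a Scrapy version that passes the crawler in):

import pymongo

class DmozPipeline(object):
    def __init__(self, host, port, db_name, doc_name):
        client = pymongo.MongoClient(host=host, port=port)
        self.post = client[db_name][doc_name]

    @classmethod
    def from_crawler(cls, crawler):
        # Read the same keys from crawler.settings instead of scrapy.conf
        s = crawler.settings
        return cls(s['MONGODB_HOST'], s.getint('MONGODB_PORT'),
                   s['MONGODB_DBNAME'], s['MONGODB_DOCNAME'])

    def process_item(self, item, spider):
        self.post.insert(dict(item))
        return item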
MongoDB, method 2 (this one was tested successfully)
First, start the local MongoDB service:
sudo service mongodb start
Then add the database configuration items to settings.py:
MONGO_HOST = "127.0.0.1"  # host IP
MONGO_PORT = 27017        # port
MONGO_DB = "Spider"       # database name
MONGO_COLL = "heartsong"  # collection name
# MONGO_USER = "zhangsan"
# MONGO_PSW = "123456"
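The snippet above does not show it, but the pipeline still has to be registered in ITEM_PIPELINES, just as in method 1. Assuming the project is named heartsong, the entry in settings.py would look roughly like this:

ITEM_PIPELINES = {
    'heartsong.pipelines.HeartsongPipeline': 300,
}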
Then write pipelines.py:
# -*- coding: utf-8 -*-
import pymongo
from scrapy.conf import settings

class HeartsongPipeline(object):  # keep this class name consistent with the entry in ITEM_PIPELINES
    def __init__(self):
        # Connect to the database
        self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        # If the database requires authentication:
        # self.client.admin.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])
        self.db = self.client[settings['MONGO_DB']]    # handle to the database
        self.coll = self.db[settings['MONGO_COLL']]    # handle to the collection

    def process_item(self, item, spider):
        postItem = dict(item)       # convert the item to a plain dict
        self.coll.insert(postItem)  # insert one record into the database
        return item                 # returning the item echoes it to the console; optional
One point worth noting: unlike MySQL, MongoDB does not require you to define tables or schemas ahead of time. When we insert a document, the collection is created automatically if it does not already exist.
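A quick standalone way to see this behaviour (a hypothetical check with pymongo, assuming the local mongod started above is running):

import pymongo

client = pymongo.MongoClient("127.0.0.1", 27017)
coll = client["Spider"]["heartsong"]        # neither the db nor the collection needs to exist yet
coll.insert({"title": "test"})              # the first insert creates both automatically
print(client["Spider"].collection_names())  # 'heartsong' now shows up in the list
coll.remove({"title": "test"})              # clean up the test document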
Once these changes are in place, the spider is ready to run. On the command line, enter the already familiar
scrapy crawl heartsong
II. MySQL: a small example
1. spider: xicidaili.py
# -*- coding: utf-8 -*-
import scrapy
from xiciip.items import XiciipItem

class XicidailiSpider(scrapy.Spider):
    name = "xicidaili"
    allowed_domains = ["xicidaili.com"]
    # start_urls = ['http://zhangjiakou.ganji.com']
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
    }

    def start_requests(self):
        # Build the requests ourselves so that the custom headers can be attached
        reqs = []
        for i in range(1, 3):
            req = scrapy.Request("http://www.xicidaili.com/nn/%s" % i, headers=self.headers)
            reqs.append(req)
        return reqs

    def parse(self, response):
        print("hahahahahhahah" + response.url)
        # pre_item = XiciipItem()
        # pre_item['url'] = response.url
        # return pre_item
        ip_list = response.xpath('//table[@id="ip_list"]')
        trs = ip_list[0].xpath('tr')
        items = []
        #### string(td[4]) extracts the full text content of the cell
        for i in trs[1:]:
            pre_item = XiciipItem()
            pre_item["ip"] = i.xpath('td[2]/text()')[0].extract()
            pre_item["port"] = i.xpath('td[3]/text()')[0].extract()
            pre_item["position"] = i.xpath('string(td[4])')[0].extract().strip()
            pre_item["type"] = i.xpath('td[6]/text()')[0].extract()
            ##### regex: \. matches a literal dot, \d matches a digit
            pre_item["speed"] = i.xpath('td[7]/div[@class="bar"]/@title').re('\d{0,}\.\d{0,}')[0]
            pre_item["last_check_time"] = i.xpath('td[9]/text()')[0].extract()
            items.append(pre_item)
        return items
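To make the regex comment above concrete, here is a tiny standalone check; the sample title string is made up, the real value comes from the div's title attribute on the page:

import re

title = u"0.23秒"  # hypothetical value of td[7]/div[@class="bar"]/@title
print(re.findall(r'\d{0,}\.\d{0,}', title)[0])  # -> 0.23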
2. items: items.py
import scrapy

class XiciipItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    ip = scrapy.Field()
    port = scrapy.Field()
    position = scrapy.Field()
    type = scrapy.Field()
    speed = scrapy.Field()
    last_check_time = scrapy.Field()
3. The main course: settings.py
MYSQL_HOSTS = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '******'
# MYSQL_PORT = settings.MYSQL_PORT
MYSQL_DB = 'xiciip'
CHARSET = 'utf8'

ITEM_PIPELINES = {
    'xiciip.pipelines.XiciipPipeline': 300,
}
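Unlike MongoDB, MySQL needs the proxy table to exist before the pipeline can insert into it. A one-off sketch for creating it (the column types here are assumptions, adjust them as needed):

import MySQLdb

# Run once before crawling to create the target table
con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='******', db='xiciip', charset='utf8')
cur = con.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS proxy (
        ip              VARCHAR(20),
        port            VARCHAR(10),
        position        VARCHAR(100),
        type            VARCHAR(20),
        speed           VARCHAR(20),
        last_check_time VARCHAR(20)
    ) DEFAULT CHARSET=utf8
""")
con.commit()
cur.close()
con.close()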
4. pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb
#### Note: settings is again imported from scrapy.conf
from scrapy.conf import settings

class XiciipPipeline(object):
    def process_item(self, item, spider):
        # DBKWARGS = spider.settings.get('DBKWARGS')
        # con = MySQLdb.connect(**DBKWARGS)
        host = settings['MYSQL_HOSTS']
        user = settings['MYSQL_USER']
        psd = settings['MYSQL_PASSWORD']
        db = settings['MYSQL_DB']
        c = settings['CHARSET']
        # Method 2 (used here): read the connection parameters from settings
        con = MySQLdb.connect(host=host, user=user, passwd=psd, db=db, charset=c)
        # Method 1 (also works): hard-code the connection parameters
        # con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='******', db='xiciip', charset='utf8')
        cur = con.cursor()
        sql = ("insert into proxy(ip,port,position,type,speed,last_check_time) "
               "values(%s,%s,%s,%s,%s,%s)")
        # sql = ('insert into p1(url) values("%s")')
        # sql = "insert into p1 values (%s)"
        # list = (item['url'].split(':')[0])
        # list = [item['url']]
        # print('wwwwwwwwwwwwwwww', list, type(list), type('h'))
        params = [item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time']]
        try:
            cur.execute(sql, params)
        except Exception, e:
            print('Insert error', e)
            con.rollback()
        else:
            con.commit()
        cur.close()
        con.close()
        return item
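A small design note: process_item above opens and closes a fresh MySQL connection for every single item, which gets expensive on larger crawls. Below is a hedged sketch of the same pipeline with the connection moved into open_spider/close_spider (same settings keys, otherwise unchanged):

import MySQLdb
from scrapy.conf import settings

class XiciipPipeline(object):
    def open_spider(self, spider):
        # One connection for the whole crawl instead of one per item
        self.con = MySQLdb.connect(host=settings['MYSQL_HOSTS'], user=settings['MYSQL_USER'],
                                   passwd=settings['MYSQL_PASSWORD'], db=settings['MYSQL_DB'],
                                   charset=settings['CHARSET'])
        self.cur = self.con.cursor()

    def process_item(self, item, spider):
        sql = ("insert into proxy(ip,port,position,type,speed,last_check_time) "
               "values(%s,%s,%s,%s,%s,%s)")
        try:
            self.cur.execute(sql, [item['ip'], item['port'], item['position'],
                                   item['type'], item['speed'], item['last_check_time']])
            self.con.commit()
        except Exception, e:
            print('Insert error', e)
            self.con.rollback()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.con.close()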