爬取腾讯招聘,scrapy项目
items配置
import scrapy
class TencentItem(scrapy.Item):
    """Scrapy item holding the data scraped for one job posting."""

    # Name of the position; the only field this project collects.
    positionName = scrapy.Field()
spider配置
# -*- coding: utf-8 -*-
import scrapy
from TenCent.items import TencentItem
class TencentSpider(scrapy.Spider):
    """Crawl the Tencent HR position listing and emit one item per table row.

    Listing pages are addressed by a ``start`` query offset. The spider begins
    at offset 0 and requests further pages in steps of 10 until the offset
    reaches 30.
    """

    name = 'tencent'
    allowed_domains = ['tencent.com']
    baseURL = "https://hr.tencent.com/position.php?&start="
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Yield a ``TencentItem`` for each listing row, then the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``TencentItem`` objects with ``positionName`` set, followed by at
            most one ``scrapy.Request`` for the next listing page.
        """
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            # extract_first() returns None when the row has no link text,
            # whereas extract()[0] raised IndexError and aborted this whole
            # callback, losing the remaining rows and the next-page request.
            position_name = node.xpath("./td[1]/a/text()").extract_first()
            if position_name is None:
                self.logger.debug(
                    "Skipping row without a position name on %s", response.url
                )
                continue

            item = TencentItem()
            item['positionName'] = position_name
            yield item

        # Pagination runs once per page (outside the row loop): advance the
        # shared offset and schedule the following listing page.
        if self.offset < 30:
            self.offset += 10
            url = self.baseURL + str(self.offset)
            yield scrapy.Request(url, callback=self.parse)
settings配置
# Scrapy settings for the TenCent project.
BOT_NAME = "TenCent"

# Package that holds the project's spiders (also used for newly generated ones).
SPIDER_MODULES = ["TenCent.spiders"]
NEWSPIDER_MODULE = "TenCent.spiders"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Enabled item pipelines, keyed by import path; the value is the order number.
ITEM_PIPELINES = {"TenCent.pipelines.TencentPipeline": 300}
pipelines配置
import json
class TencentPipeline(object):
    """Item pipeline that writes every scraped item to ``tencent.json``.

    Each item is serialized as a JSON object on its own line, followed by a
    comma. Non-ASCII characters are written as-is (UTF-8), not escaped.
    """

    def __init__(self):
        # The constructor must be spelled with double underscores. With
        # `_init_` Python never calls it, so `self.file` is never created and
        # process_item() fails with "object has no attribute 'file'".
        # The built-in open() is used so no extra import is required; the
        # explicit encoding keeps the output UTF-8 on every platform.
        self.file = open("tencent.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Append ``item`` to the output file and pass it on unchanged.

        Args:
            item: The scraped item (anything ``dict()`` accepts).
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        content = json.dumps(dict(item), ensure_ascii=False)
        # The separator has to be joined to the text *before* writing; the
        # file is opened in text mode, so it takes str and encodes it itself.
        self.file.write(content + ",\n")
        return item

    def close_spider(self, spider=None):
        """Close the output file when the spider finishes.

        Args:
            spider: The spider being closed. Scrapy passes it positionally;
                it defaults to ``None`` so argument-less calls keep working.
        """
        self.file.close()
蛋疼的报错
2018-05-10 09:11:12 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: TenCent)
2018-05-10 09:11:12 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.7, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.1.0, Python 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 10:22:32) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 17.5.0 (OpenSSL 1.0.2n 7 Dec 2017), cryptography 2.1.4, Platform Windows-7-6.1.7601-SP1
2018-05-10 09:11:12 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'TenCent', 'NEWSPIDER_MODULE': 'TenCent.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['TenCent.spiders']}
2018-05-10 09:11:12 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2018-05-10 09:11:13 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-05-10 09:11:13 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-05-10 09:11:13 [scrapy.middleware] INFO: Enabled item pipelines:
['TenCent.pipelines.TencentPipeline']
2018-05-10 09:11:13 [scrapy.core.engine] INFO: Spider opened
2018-05-10 09:11:13 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-05-10 09:11:13 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-05-10 09:11:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://hr.tencent.com/robots.txt> (referer: None)
2018-05-10 09:11:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://hr.tencent.com/position.php?&start=0> (referer: None)
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说游戏资深运营经理(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说类资深项目管理(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说类内容IP版权管理(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'SNG04-广告业务后台工程师(上海)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'SNG04-UGC质量后台开发工程师(上海)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '25667-运营商渠道销售(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'MIG08-后台开发高级工程师(广州)(腾讯WiFi管家)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'MIG16-车联网大数据及算法产品经理'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说类游戏商业化运营(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说类美术编辑(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://hr.tencent.com/position.php?&start=10> (referer: https://hr.tencent.com/position.php?&start=0)
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说类内容编辑(深圳/北京)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说类内容编辑(深圳/北京)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说类资深内容责编(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'SNG08-高级品牌视觉设计师(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'TEG09-推荐系统后台开发工程师(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'WXG06-321 微信境外支付高级区域经理(日本)微信境外支付高级区域经理(台北)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说类游戏策划(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说类平台高级产品经理(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '21087-互动小说类游戏版本管理(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'WXG06-321 微信境外支付高级区域经理(香港)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://hr.tencent.com/position.php?&start=20> (referer: https://hr.tencent.com/position.php?&start=10)
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'WXG06-321 微信境外支付高级区域经理(日本)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'WXG06-321 微信境外支付高级区域经理(日本)微信境外支付高级区域经理(泰国)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '16810-动漫商务经理(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '16810-动漫付费业务编辑(北京)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'WXG06-321 微信境外支付高级区域经理(日本)微信境外支付高级区域经理(荷兰))'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '15612-手游关卡策划(北京)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '15575-《圣斗士星矢》资深系统策划(成都)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'MIG06-智能硬件产品经理(人机交互方向)(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': '20772-企鹅影视天机工作室制片人'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'MIG03-移动端测试开发工程师(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://hr.tencent.com/position.php?&start=30> (referer: https://hr.tencent.com/position.php?&start=20)
2018-05-10 09:11:15 [scrapy.core.scraper] ERROR: Error processing {'positionName': '25923-互动娱乐游戏数据库管理(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:15 [scrapy.core.scraper] ERROR: Error processing {'positionName': '27087-腾讯开放平台部投资经理(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:15 [scrapy.core.scraper] ERROR: Error processing {'positionName': '15605-特效设计(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:15 [scrapy.core.scraper] ERROR: Error processing {'positionName': '20589-海外PM(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:15 [scrapy.core.scraper] ERROR: Error processing {'positionName': '15575-王者荣耀游戏社区产品经理(成都)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:15 [scrapy.core.scraper] ERROR: Error processing {'positionName': '23673-数码编辑(北京)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:15 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'S2-CDG财务管理(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:15 [scrapy.core.scraper] ERROR: Error processing {'positionName': 'SNG16-腾讯音乐用户研究工程师(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:15 [scrapy.core.scraper] ERROR: Error processing {'positionName': '27087-腾讯开放平台大数据投资系统前端开发(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:15 [scrapy.core.scraper] ERROR: Error processing {'positionName': '27087-投资孵化中心大数据分析师(深圳)'}
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\PY\TenCent\TenCent\pipelines.py", line 17, in process_item
self.file.write(content.encode("utf-8")) + ",\n"
AttributeError: 'TencentPipeline' object has no attribute 'file'
2018-05-10 09:11:15 [scrapy.core.engine] INFO: Closing spider (finished)
2018-05-10 09:11:15 [scrapy.core.engine] ERROR: Scraper close failure
Traceback (most recent call last):
File "F:\Anaconda\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
TypeError: close_spider() takes 1 positional argument but 2 were given
2018-05-10 09:11:15 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1467,
'downloader/request_count': 5,
'downloader/request_method_count/GET': 5,
'downloader/response_bytes': 16172,
'downloader/response_count': 5,
'downloader/response_status_count/200': 5,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 5, 10, 1, 11, 15, 103335),
'log_count/DEBUG': 6,
'log_count/ERROR': 41,
'log_count/INFO': 7,
'request_depth_max': 3,
'response_received_count': 5,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'start_time': datetime.datetime(2018, 5, 10, 1, 11, 13, 535246)}
2018-05-10 09:11:15 [scrapy.core.engine] INFO: Spider closed (finished)
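最后把打开模式从 w+ 改为 wb 就成功了,具体原理尚不大懂。(从上面的日志看,报错本身是因为 pipeline 对象上没有 file 属性,以及 close_spider 少接收了一个 spider 参数;下面改用 open_spider(self, spider) 打开文件、close_spider(self, spider) 关闭文件之后,这两个问题都不再出现。)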
pipelines修改为
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import codecs
class TencentPipeline(object):
    """Item pipeline that stores scraped items in ``items.json``.

    The file is opened when the spider starts and closed when it finishes;
    in between, every item is written as one JSON object per line.
    """

    def open_spider(self, spider):
        """Open the output file for the spider that is starting."""
        # With an encoding given, codecs.open() always opens the underlying
        # file in binary mode and encodes the str passed to write() itself.
        self.file = codecs.open('items.json', mode='wb', encoding='utf-8')

    def process_item(self, item, spider):
        """Write ``item`` as a single JSON line and return it unchanged."""
        record = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(record + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file once the spider is done."""
        self.file.close()