python scrapy 腾讯社会招聘爬虫摘要

2023年11月4日 269次阅读来源: IM余安

一 . 编写scrapy爬虫

创建项目：D:\scrapy>scrapy startproject Tencent
D:\scrapy> cd Tencent
创建爬虫：D:\scrapy\Tencent>scrapy genspider tencent hr.tencent.com

#　腾讯社招 https://hr.tencent.com/position.php?&start=0#a

职位名 positionName

职位详情连接　positionLink

职位类别  positionType

招聘人数  peopleNumber

工作地点  workLocation

发布时间  publishTime

scrapy启动时，会先读取配置文件 settings.py

编写items.py，明确需要提取的数据。
编写spiders/xxxx.py爬虫文件，处理请求和响应，以及提取数据（yield item）。
编写pipelines.py，编写管道文件，处理spider返回item数据。
编写settings.py，启用管道文件，以及其他相关设置。
执行爬虫，调试等。

二、配置mysql数据库

1. 安装pymysql

cmd下：pip install pymysql
然后python >>> import pymysql 检查是否安装完成，安装成功！

2. 查看items.py

class TencentItem(scrapy.Item):
    """Container for the fields scraped from one row of the job listing."""
    # Job title
    positionName = scrapy.Field()
    # Link to the job detail page
    positionLink = scrapy.Field()
    # Job category
    positionType = scrapy.Field()
    # Number of openings
    peopleNumber = scrapy.Field()
    # Work location
    workLocation = scrapy.Field()
    # Publish date
    publishTime = scrapy.Field()

3. 创建数据库和表

mysql> create database tencent DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;
mysql> use tencent;

-- One row per scraped job posting.
-- `id` is AUTO_INCREMENT because the pipeline's INSERT statement never
-- supplies an id; without it the insert fails (or collides on key 0).
-- `positionLink` gets an explicit DEFAULT NULL (a bare DEFAULT with no
-- value is a syntax error).
-- `publishTime` no longer carries ON UPDATE CURRENT_TIMESTAMP, so updating
-- a row later does not overwrite the publish time taken from the site.
CREATE TABLE `hr` (
`id`  int(10) NOT NULL AUTO_INCREMENT ,
`positionName`  varchar(100) NOT NULL COMMENT '职位名' ,
`positionLink`  varchar(150) NULL DEFAULT NULL COMMENT '职位详情连接' ,
`positionType`  varchar(30) NULL COMMENT '职位类别' ,
`peopleNumber`  int(10) NULL COMMENT '招聘人数' ,
`workLocation`  varchar(30) NULL COMMENT '工作地点' ,
`publishTime`  timestamp NULL DEFAULT NULL COMMENT '发布时间' ,
PRIMARY KEY (`id`)
);

4. settings.py

# Enabled item pipelines, mapped to their order value.
# The alternative pipeline is left disabled:
#   "Tencent.pipelines.TencentPipeline": 300
ITEM_PIPELINES = {
    "Tencent.pipelines.TencentMysqlDBPipeline": 200,
}

# MySQL connection settings read by the pipeline.
MYSQL_HOST = "127.0.0.1"
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWD = ""
MYSQL_DBNAME = "tencent"

5. pipelines.py

# import json
import pymysql
# from scrapy.conf import settings
from scrapy import log
from twisted.enterprise import adbapi


class TencentMysqlDBPipeline(object):
    """Item pipeline that inserts every scraped job posting into the ``hr`` table.

    Inserts are submitted to a Twisted ``adbapi`` connection pool and the
    resulting Deferred is returned from ``process_item``.  Log output goes
    through ``spider.logger`` instead of the deprecated ``scrapy.log``
    module, so the class itself does not depend on that import.
    """

    @classmethod
    def from_settings(cls, settings):
        """Create the pipeline from the project settings.

        Reads ``MYSQL_HOST``, ``MYSQL_DBNAME``, ``MYSQL_USER``,
        ``MYSQL_PASSWD`` and ``MYSQL_PORT`` and builds a ``pymysql``-backed
        connection pool from them.

        Returns:
            A pipeline instance wrapping the new connection pool.
        """
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            port=settings['MYSQL_PORT'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('pymysql', **dbargs)
        return cls(dbpool)

    def __init__(self, dbpool):
        """Store the connection pool used for all inserts."""
        self.dbpool = dbpool

    def close_spider(self, spider):
        """Release the pooled database connections when the spider closes."""
        self.dbpool.close()

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` and return a Deferred firing with it.

        A failed insert is logged by ``_handle_error`` and does not drop the
        item: the Deferred always resolves to ``item`` so later pipelines
        still receive it.
        """
        deferred = self.dbpool.runInteraction(self._conditional_insert, item, spider)
        spider.logger.info("-------------------连接好了-------------------")
        deferred.addErrback(self._handle_error, item, spider)
        deferred.addBoth(lambda _: item)
        return deferred

    def _conditional_insert(self, cursor, item, spider):
        """Insert one item into ``hr`` using the cursor supplied by the pool.

        The statement is parameterized; the ``id`` column is not supplied
        and is left for the database to fill in.
        """
        spider.logger.info("-------------------打印-------------------")
        cursor.execute(
            "insert into hr (positionName, positionLink, positionType, peopleNumber, workLocation, publishTime) values(%s, %s, %s, %s, %s, %s)",
            (
                item['positionName'],
                item['positionLink'],
                item['positionType'],
                item['peopleNumber'],
                item['workLocation'],
                item['publishTime'],
            ),
        )
        spider.logger.info("-------------------一轮循环完毕-------------------")

    def _handle_error(self, failure, item, spider):
        """Log a failed insert instead of printing it to stdout."""
        spider.logger.error("MySQL insert failed: %s", failure)

6. spiders/tencent.py

# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem
# https://hr.tencent.com/position.php?&start=0#a

class TencentSpider(scrapy.Spider):
    """Crawl the Tencent recruitment listing and yield one item per job row.

    Starts at ``baseURL`` with ``offset`` 0 and follows the "next" link
    until the listing marks it as inactive.
    """

    name = 'tencent'
    allowed_domains = ['tencent.com']
    baseURL = 'https://hr.tencent.com/position.php?&start='
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Extract every job row on the page, then request the next page.

        Yields:
            ``TencentItem`` objects holding text (not encoded bytes), and a
            ``scrapy.Request`` for the next listing page when there is one.
        """
        for node in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            position_name = node.xpath("./td[1]/a/text()").extract_first()
            position_href = node.xpath("./td[1]/a/@href").extract_first()
            if not position_name or not position_href:
                # A row without a title link is not a job posting; skip it
                # rather than aborting the whole page with an IndexError.
                self.logger.warning("Skipping row without a position link on %s", response.url)
                continue

            item = TencentItem()
            item['positionName'] = position_name
            # urljoin resolves the relative href against the page URL.
            item['positionLink'] = response.urljoin(position_href)
            # Some rows have an empty category cell; fall back to a placeholder.
            item['positionType'] = node.xpath("./td[2]/text()").extract_first(default="无类别")
            item['peopleNumber'] = node.xpath("./td[3]/text()").extract_first()
            item['workLocation'] = node.xpath("./td[4]/text()").extract_first()
            item['publishTime'] = node.xpath("./td[5]/text()").extract_first()

            # yield hands the item to the engine and resumes here afterwards.
            yield item

        # Approach 2: follow the "next" link to crawl every page.
        # The link carries class "noactive" only on the last page, so an
        # empty match means there are more pages to fetch.
        on_last_page = bool(response.xpath("//a[@class='noactive' and @id='next']"))
        if not on_last_page:
            next_href = response.xpath("//a[@id='next']/@href").extract_first()
            if next_href:
                yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

        # Approach 1 (alternative): crawl a fixed number of pages by offset.
        # if self.offset < 3980:
        #     self.offset += 10
        #     url = self.baseURL + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)

    # #def parse_next(self,response):
    #     #pass

    原文作者：IM余安
    原文地址: https://www.jianshu.com/p/8ff0c1760815
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。