Tencent Recruitment

http://hr.tencent.com/position.php

items.py: add the following code

from scrapy.item import Item, Field

class TencentItem(Item):
    title = Field()            # position title
    catalog = Field()          # position category
    workLocation = Field()     # work location
    recruitNumber = Field()    # number of openings
    duty = Field()             # job duties
    Job_requirement = Field()  # job requirements
    url = Field()              # detail-page URL
    publishTime = Field()      # publish date
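
A TencentItem behaves like a dict whose keys are limited to the declared fields, which catches typos early. A quick check in a Python shell (a throwaway snippet, not part of the project files):

from tutorial.items import TencentItem

item = TencentItem()
item['title'] = u'Example title'   # declared fields assign like dict keys
print(dict(item))                  # {'title': u'Example title'}
# item['salary'] = '10k'           # raises KeyError: undeclared field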

Create a new spider file tencent_info.py in the spiders directory:

# -*- coding:utf-8 -*-
import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from tutorial.items import TencentItem


class TencentSpider(CrawlSpider):
    name = "tengxun_info"
    allowed_domains = ["tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php"
    ]

    rules = [
        # Follow the pagination links; with no callback, follow defaults to True
        Rule(LinkExtractor(allow=(r"start=\d+",))),
        # Hand every position detail page to parse_item
        Rule(LinkExtractor(allow=(r"position_detail\.php",)), follow=True, callback='parse_item'),
    ]

    def parse_item(self, response):
        item = TencentItem()

        title = response.xpath('//*[@id="sharetitle"]/text()')[0].extract()
        workLocation = response.xpath('//*[@class="lightblue l2"]/../text()')[0].extract()
        catalog = response.xpath('//*[@class="lightblue"]/../text()')[0].extract()
        recruitNumber = response.xpath('//*[@class="lightblue"]/../text()').re(r'(\d+)')[0]

        # The duties and requirements are the first and second "squareli" blocks;
        # strip the HTML tags to keep only the text
        duty_pre = response.xpath('//*[@class="squareli"]')[0].extract()
        duty = re.sub(r'<.*?>', '', duty_pre)

        Job_requirement_pre = response.xpath('//*[@class="squareli"]')[1].extract()
        Job_requirement = re.sub(r'<.*?>', '', Job_requirement_pre)

        item['title'] = title
        item['url'] = response.url
        item['workLocation'] = workLocation
        item['catalog'] = catalog
        item['recruitNumber'] = recruitNumber
        item['duty'] = duty
        item['Job_requirement'] = Job_requirement

        yield item
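
Before running the full crawl, the XPath expressions above are worth checking interactively with scrapy shell. The contains() expressions below are illustrative guesses at the listing markup, not taken from the page itself:

$ scrapy shell "http://hr.tencent.com/position.php"
>>> # links the pagination Rule should match
>>> response.xpath('//a[contains(@href, "start=")]/@href').extract()[:3]
>>> # links the detail-page Rule should match
>>> response.xpath('//a[contains(@href, "position_detail.php")]/@href').extract()[:3]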

pipelines.py: add the following code

import json
import codecs

class JsonWriterPipeline(object):
    """Write every item as one JSON object per line to items.json."""

    def __init__(self):
        self.file = codecs.open('items.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text readable in the output
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # called automatically by Scrapy when the spider finishes
        # (the hook must be named close_spider; a method named spider_closed
        # is never invoked unless it is explicitly connected to the signal)
        self.file.close()
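
The same output can be produced with Scrapy's built-in JsonLinesItemExporter instead of hand-rolled json.dumps. A minimal sketch (the class name JsonExporterPipeline is made up for illustration):

from scrapy.exporters import JsonLinesItemExporter

class JsonExporterPipeline(object):

    def open_spider(self, spider):
        self.file = open('items.jl', 'wb')  # the exporter writes bytes
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()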

settings.py: add the following code (to enable the pipeline component)

ITEM_PIPELINES = {
    'tutorial.pipelines.JsonWriterPipeline': 300,
}
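
The value 300 sets the pipeline's run order (0-1000, lower runs earlier); it only matters once several pipelines are enabled. As a quick alternative to the custom pipeline, Scrapy's built-in feed export can dump items directly from the command line:

scrapy crawl tengxun_info -o items.json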

Create a main.py file in the project root, used for debugging:

from scrapy import cmdline
cmdline.execute('scrapy crawl tengxun_info'.split())
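
Running python main.py (or launching it under an IDE debugger) is equivalent to executing scrapy crawl tengxun_info from the project root; the pipeline then writes its output to items.json in the working directory.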