腾讯招聘
http://hr.tencent.com/position.php
items.py:添加以下代码
from scrapy.item import Item, Field
class TencentItem(Item):
    """Structured record for a single job posting scraped from the Tencent HR site."""

    # Headline of the posting.
    title = Field()
    # Job category text.
    catalog = Field()
    # Where the position is based.
    workLocation = Field()
    # Number of openings (stored as the digit string matched on the page).
    recruitNumber = Field()
    # Tag-stripped text of the first "squareli" list on the page.
    duty = Field()
    # Tag-stripped text of the second "squareli" list on the page.
    Job_requirement = Field()
    # Address of the detail page the item was built from.
    url = Field()
    # Declared for the publication date; the spider in this tutorial
    # never assigns it.
    publishTime = Field()
在 spiders 目录下新建一个自定义的爬虫文件 tencent_info.py,并添加以下代码:
# -*- coding:utf-8 -*-
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from tutorial.items import TencentItem
class TencentSpider(CrawlSpider):
    """Crawl the Tencent recruitment listing and emit one ``TencentItem`` per job page.

    Starting from the position listing, one rule follows pagination links
    (``start=<n>``) and another follows links to ``position_detail.php``
    pages, passing each of those responses to ``parse_item``.
    """

    name = "tengxun_info"
    allowed_domains = ["tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php",
    ]
    # Raw strings keep the regex escapes (\d, \.) from being treated as
    # invalid Python string escapes.
    rules = [
        # Listing pagination: no callback, links are only followed.
        Rule(LinkExtractor(allow=r"start=\d+")),
        # Job detail pages: parsed by parse_item and followed further.
        Rule(
            LinkExtractor(allow=r"position_detail\.php"),
            follow=True,
            callback='parse_item',
        ),
    ]

    # Matches one HTML tag, including tags whose attributes span lines.
    _TAG_RE = re.compile(r'<[^>]*>')

    @staticmethod
    def _strip_tags(html):
        """Return ``html`` with every HTML tag removed (text content is kept)."""
        return TencentSpider._TAG_RE.sub('', html)

    def parse_item(self, response):
        """Extract the job fields from one detail page.

        Args:
            response: the downloaded ``position_detail.php`` page.

        Yields:
            A ``TencentItem``. Fields whose markup is absent from the page
            are set to ``None`` instead of aborting the whole item. Pages
            without a title element are skipped with a warning.
            ``publishTime`` is not populated here.
        """
        title = response.xpath('//*[@id="sharetitle"]/text()').extract_first()
        if title is None:
            # Without a title this is not a usable job page; skip it
            # rather than raising IndexError.
            self.logger.warning("No job title found on %s; skipping", response.url)
            return

        # Catalog and head count are both read from the same node set, so
        # the XPath is evaluated once.
        summary = response.xpath('//*[@class="lightblue"]/../text()')
        # First "squareli" list is the duty text, second the requirements.
        sections = response.xpath('//*[@class="squareli"]').extract()

        item = TencentItem()
        item['title'] = title
        item['url'] = response.url
        item['workLocation'] = response.xpath(
            '//*[@class="lightblue l2"]/../text()').extract_first()
        item['catalog'] = summary.extract_first()
        item['recruitNumber'] = summary.re_first(r'(\d+)')
        item['duty'] = (
            self._strip_tags(sections[0]) if len(sections) > 0 else None)
        item['Job_requirement'] = (
            self._strip_tags(sections[1]) if len(sections) > 1 else None)
        yield item
在 pipelines.py 中添加如下代码:
import json
import codecs
class JsonWriterPipeline(object):
    """Item pipeline that writes every item to ``items.json``, one JSON object per line."""

    def __init__(self):
        # Opened in 'w' mode: an existing items.json is truncated on start.
        self.file = codecs.open('items.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize ``item`` as one JSON line and pass it on unchanged.

        Args:
            item: the scraped item; anything ``dict()`` can convert.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable
        # in the output instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        ``close_spider`` is the hook Scrapy invokes on item pipelines; the
        previous name ``spider_closed`` was never called by the framework,
        so the file was left open.
        """
        self.file.close()

    # Kept so existing code that calls or connects ``spider_closed`` still works.
    spider_closed = close_spider
在 settings.py 中添加如下代码(启用 Item Pipeline 组件):
# Enable the JSON writer pipeline. The integer is the pipeline's order
# value: pipelines with lower numbers run first.
ITEM_PIPELINES = {'tutorial.pipelines.JsonWriterPipeline': 300}
在项目根目录下新建main.py文件,用于调试
from scrapy import cmdline
# Start the spider in-process with the same argv as the shell command
# `scrapy crawl tengxun_info`.
cmdline.execute(['scrapy', 'crawl', 'tengxun_info'])