丁香园 (DXY) Anti-Ban Strategy: Distributed Crawling in Practice

We use the 丁香园用药助手 (DXY Drug Assistant) project as the example.

The architecture diagram is shown below:

First, crawl the site once through its pharmacological classification pages and sort the results by drug_id. This shows:

We need to cover http://drugs.dxy.cn/drug/[50000-150000].htm
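The bracket notation stands for one detail page per drug id. As a small illustration (the URL pattern and id bounds are simply those from the line above, written in Python 2 to match the spider later in this article), the range expands to:

# Sketch: expand the id range [50000-150000] into page URLs
urls = ['http://drugs.dxy.cn/drug/%d.htm' % i for i in xrange(50000, 150001)]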

Normal crawling:

Abnormal responses include the following:

  • The drug does not exist

  • When the crawl rate is too high, a CAPTCHA page is returned

  • When the cumulative number of requests in one day is too high, access is blocked

This is where proxies come in; a minimal sketch of telling these cases apart follows.
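As a rough sketch, the first two cases can be recognized directly from the response body (the marker strings are the ones the spider below checks for; the daily ban does not have a body marker here and is mitigated with proxies and the redirect settings in step 4):

def classify_response(body):
    # Sketch: classify a response body using the page markers listed above
    if '药品不存在' in body:
        return 'missing'   # drug does not exist
    if '请填写验证码继续正常访问' in body:
        return 'captcha'   # crawling too fast, CAPTCHA page returned
    return 'ok'            # normal drug detail page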

Project workflow

1. Create the project

scrapy startproject drugs_dxy

Create the spider:

cd drugs_dxy/
scrapy genspider -t basic Drugs dxy.cn

2. Add the DrugsItem class in items.py

class DrugsItem(scrapy.Item):
    # define the fields for your item here like:
    # flag: drug does not exist
    exists = scrapy.Field()
    # drug id
    drugtId = scrapy.Field()
    # scraped data
    data = scrapy.Field()
    # CAPTCHA status message
    msg = scrapy.Field()

3. Edit the DrugsSpider class under spiders/

# -*- coding: utf-8 -*-
import re

import scrapy

from drugs_dxy.items import DrugsItem


class DrugsSpider(scrapy.Spider):
    name = "Drugs"
    allowed_domains = ["dxy.cn"]
    size = 60

    def start_requests(self):
        # seed `size` parallel request chains; each chain advances by `size` in parse()
        for i in xrange(50000, 50000 + self.size, 1):
            url = 'http://drugs.dxy.cn/drug/%d.htm' % (i)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        drug_Item = DrugsItem()
        drug_Item["drugtId"] = int(re.search('(\d+)', response.url).group(1))
        if drug_Item["drugtId"] >= 150000:
            return
        # schedule the next drug id in this chain
        url = 'http://drugs.dxy.cn/drug/%d.htm' % (drug_Item["drugtId"] + self.size)
        yield scrapy.Request(url=url, callback=self.parse)
        if '药品不存在' in response.body:
            drug_Item['exists'] = False
            yield drug_Item
            return
        if '请填写验证码继续正常访问' in response.body:
            # CAPTCHA page encountered: record the message and stop parsing this response
            drug_Item["msg"] = u'请填写验证码继续正常访问'
            return
        drug_Item["data"] = {}
        details = response.xpath("//dt")
        for detail in details:
            detail_name = detail.xpath('./span/text()').extract()[0].split(':')[0]
            if detail_name == u'药品名称':
                drug_Item['data'][u'药品名称'] = {}
                try:
                    detail_str = detail.xpath("./following-sibling::*[1]")
                    detail_value = detail_str.xpath('string(.)').extract()[0].replace('\r', '').replace('\t', '').strip()
                    for item in detail_value.split('\n'):
                        item = item.replace('\r', '').replace('\n', '').replace('\t', '').strip()
                        name = item.split(u':')[0]
                        value = item.split(u':')[1]
                        drug_Item['data'][u'药品名称'][name] = value
                except:
                    pass
            else:
                detail_str = detail.xpath("./following-sibling::*[1]")
                detail_value = detail_str.xpath('string(.)').extract()[0].replace('\r', '').replace('\t', '').strip()
                drug_Item['data'][detail_name] = detail_value
        yield drug_Item

4. Scrapy proxy settings

4.1 In settings.py

1) Enable the scrapy_redis components

# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300
}

# Specify the host and port to use when connecting to Redis (optional).
REDIS_HOST = '101.200.170.171'
REDIS_PORT = 6379

# Custom redis client parameters (i.e.: socket timeout, etc.)
REDIS_PARAMS = {}
#REDIS_URL = 'redis://user:pass@hostname:9001'
REDIS_PARAMS['password'] = 'itcast.cn'
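A quick way to sanity-check this configuration is to connect to the same Redis instance and look at the keys scrapy_redis creates for the spider. The key names below are the library's defaults (they may differ across scrapy_redis versions); this is only a small inspection sketch using redis-py:

# Sketch: inspect the keys scrapy_redis maintains for the "Drugs" spider
import redis

r = redis.Redis(host='101.200.170.171', port=6379, password='itcast.cn')
print r.keys('Drugs:*')      # e.g. Drugs:requests (scheduler queue), Drugs:items (pipeline output)
print r.llen('Drugs:items')  # number of serialized items pushed by RedisPipeline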

2) Enable the downloader middlewares: the custom ProxyMiddleware, with the built-in HttpProxyMiddleware disabled

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'drugs_dxy.middlewares.ProxyMiddleware': 400,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
}

3) Disable redirects (301/302) and set a 90-second download timeout

DOWNLOAD_TIMEOUT = 90
REDIRECT_ENABLED = False

4.2 Create and edit middlewares.py in the drugs_dxy directory (same level as settings.py)

# -*- coding: utf-8 -*-
import base64

import redis


class ProxyMiddleware(object):
    def __init__(self, settings):
        self.queue = 'Proxy:queue'
        # Initialize the Redis connection that holds the proxy list
        self.r = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'),
                             db=1,
                             password=settings.get('REDIS_PARAMS')['password'])

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # Pop a proxy from the queue (blocking) and attach it to the request
        proxy = {}
        source, data = self.r.blpop(self.queue)
        proxy['ip_port'] = data
        proxy['user_pass'] = None
        if proxy['user_pass'] is not None:
            # proxy with authentication, e.g. "USERNAME:PASSWORD"
            request.meta['proxy'] = "http://%s" % proxy['ip_port']
            encoded_user_pass = base64.encodestring(proxy['user_pass'])
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
            print "********ProxyMiddleware have pass*****" + proxy['ip_port']
        else:
            # proxy without authentication
            print request.url, proxy['ip_port']
            request.meta['proxy'] = "http://%s" % proxy['ip_port']

    def process_response(self, request, response, spider):
        """
        Check response.status: only if the status is acceptable is the proxy
        pushed back to the queue; otherwise it is treated as bad and dropped.
        """
        print("-------%s %s %s------" % (request.meta["proxy"], response.status, request.url))
        if response.status == 200:
            # proxy worked: push it back to the tail of the queue for reuse
            print 'rpush', request.meta["proxy"]
            self.r.rpush(self.queue, request.meta["proxy"].replace('http://', ''))
        # always hand the response back to the engine
        return response

    def process_exception(self, request, exception, spider):
        """
        Handle connection errors caused by the proxy: take a new proxy
        from the queue and retry the request.
        """
        proxy = {}
        source, data = self.r.blpop(self.queue)
        proxy['ip_port'] = data
        proxy['user_pass'] = None
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        new_request = request.copy()
        new_request.dont_filter = True
        return new_request
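The middleware expects proxies to already be sitting in the Redis list Proxy:queue in db 1. A minimal sketch of seeding that queue (the ip:port values are placeholders; host, port, and password are the ones configured in settings.py above):

# Sketch: seed the proxy queue consumed by ProxyMiddleware (db=1, key 'Proxy:queue')
import redis

r = redis.Redis(host='101.200.170.171', port=6379, db=1, password='itcast.cn')
for ip_port in ['1.2.3.4:8080', '5.6.7.8:3128']:  # placeholder proxies
    r.rpush('Proxy:queue', ip_port)
print r.llen('Proxy:queue')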

5. Run
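Start the spider by the name given to genspider above:

cd drugs_dxy/
scrapy crawl Drugs

Because scheduling and de-duplication go through the shared Redis instance, the same command can be launched on several machines to crawl the id range in parallel.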