美团App热门商圈团购采集(2)

把上一节生成的城市信息文件 items.json 重命名为 city_items.json,作为第二部分爬虫的启动数据。

添加items.py

class MeituanItem(Item):
    """Scrapy item for one page of Meituan deal results."""
    # Single catch-all field: parse() stores the entire decoded JSON
    # payload of a listing page here.
    data = Field()

创建模板:

scrapy genspider -t basic Meituan_meishi meituan.com

添加以下代码到Meituan_meishi.py

# -*- coding: utf-8 -*-
import scrapy
import codecs
import json
from tutorial.items import MeituanItem
import re

class MeituanMeishiSpider(scrapy.Spider):
    """Collect food (meishi) group-buy listings from the Meituan mobile API.

    Seeded by ``city_items.json`` (one JSON object per line, produced by the
    previous crawl step); each line carries latitude/longitude under
    ``item['data']``.
    """
    name = "Meituan_meishi"
    allowed_domains = ["meituan.com"]
    offset = 0

    def start_requests(self):
        """Read city coordinates and yield the first listing request.

        Yields:
            scrapy.Request for offset=0 of the deal listing, handled by
            :meth:`parse`.
        """
        # Context manager guarantees the file is closed even if a line
        # fails to decode.
        with codecs.open('city_items.json', 'r', encoding='utf-8') as city_file:
            for line in city_file:
                item = json.loads(line)
                latitude = item['data']['latitude']
                longitude = item['data']['longitude']

                # Round coordinates to 6 decimal places for the mypos param.
                lat = round(float(latitude), 6)
                lng = round(float(longitude), 6)

                # NOTE(review): the city id (42) is hardcoded in the URL even
                # though the seed file carries item['data']['cityid'] — confirm
                # whether per-city URLs were intended.
                url = ('http://api.mobile.meituan.com/group/v4/deal/select/city/42/cate/1'
                       '?sort=defaults&mypos=' + str(lat) + '%2C' + str(lng) +
                       '&offset=0&limit=15')
                yield scrapy.Request(url, callback=self.parse)
                # Deliberate demo limitation: crawl only the first city.
                break

    def parse(self, response):
        """Store one page of results and schedule the next page.

        Args:
            response: API response whose body is a JSON document.

        Yields:
            MeituanItem with the decoded payload, then a Request for the
            next page (offset advanced by the page size of 15).
        """
        item = MeituanItem()
        item['data'] = json.loads(response.body)
        yield item

        # Bump the offset query parameter by 15 and follow the next page.
        # Raw strings avoid invalid-escape warnings for \d.
        offset = re.search(r'offset=(\d+)', response.request.url).group(1)
        url = re.sub(r'offset=\d+', 'offset=' + str(int(offset) + 15),
                     response.request.url)
        yield scrapy.Request(url, callback=self.parse)

运行:

scrapy runspider tutorial/spiders/Meituan_meishi.py