美团App热门商圈团购采集(2)
将上一节生成的城市信息文件 items.json 重命名为 city_items.json,作为本节(第二部分)爬虫的启动数据。
在 items.py 中添加以下内容:
class MeituanItem(Item):
    """Scrapy item that carries one decoded API response."""

    # Holds the JSON payload assigned by the spider's parse() callback.
    data = Field()
创建模板:
scrapy genspider -t basic Meituan_meishi meituan.com
添加以下代码到Meituan_meishi.py
# -*- coding: utf-8 -*-
import scrapy
import codecs
import json
from tutorial.items import MeituanItem
import re
class MeituanMeishiSpider(scrapy.Spider):
    """Collect food group-buying deal pages from the Meituan mobile API.

    Seed data is read from ``city_items.json`` (one JSON object per line).
    Every response is stored whole in a ``MeituanItem``; the next page is
    requested by advancing the ``offset`` query parameter of the URL.
    """

    name = "Meituan_meishi"
    allowed_domains = ["meituan.com"]
    # The template's default ``start_urls`` is not used: the initial
    # requests are built by start_requests() from the seed file instead.
    offset = 0
    # Page size: sent as the ``limit`` query parameter and used as the
    # increment applied to ``offset`` when paging.
    page_size = 15

    def start_requests(self):
        """Yield the first-page request built from the seed file.

        Each non-blank line of ``city_items.json`` is decoded as JSON and
        its ``data`` entry supplies ``cityid``, ``latitude`` and
        ``longitude`` for the request URL.
        """
        # ``with`` closes the file even if the generator is abandoned
        # before it runs to completion.
        with codecs.open('city_items.json', 'r', encoding='utf-8') as seed_file:
            for line in seed_file:
                if not line.strip():
                    # A blank line would make json.loads() raise.
                    continue
                city = json.loads(line)['data']
                lat = round(float(city['latitude']), 6)
                lng = round(float(city['longitude']), 6)
                # Use the seed's own city id so that it matches the
                # coordinates sent in ``mypos`` (it was hard-coded to 42).
                url = (
                    'http://api.mobile.meituan.com/group/v4/deal/select'
                    '/city/{city}/cate/1'
                    '?sort=defaults&mypos={lat}%2C{lng}'
                    '&offset=0&limit={limit}'
                ).format(
                    city=city['cityid'],
                    lat=str(lat),
                    lng=str(lng),
                    limit=self.page_size,
                )
                yield scrapy.Request(url, callback=self.parse)
                # NOTE(review): only the first seed line is crawled; this
                # looks like a deliberate limit - confirm before removing.
                break

    def parse(self, response):
        """Store one page of API data and schedule the following page.

        Yields a ``MeituanItem`` whose ``data`` field is the decoded JSON
        body, then a request for the next page unless a stop condition
        is met.
        """
        payload = json.loads(response.body)
        item = MeituanItem()
        item['data'] = payload
        yield item

        # Stop paging on an empty payload; otherwise the spider would keep
        # requesting ever larger offsets forever.
        if not payload:
            return
        # NOTE(review): presumably the deals are listed under a top-level
        # 'data' key - confirm against a real response. The check only
        # fires when that key exists and is empty, so other response
        # shapes keep the original paging behaviour.
        if isinstance(payload, dict) and 'data' in payload and not payload['data']:
            return

        current_url = response.request.url
        match = re.search(r'offset=(\d+)', current_url)
        if match is None:
            # Without an offset parameter there is nothing to advance.
            return
        next_offset = int(match.group(1)) + self.page_size
        next_url = re.sub(
            r'offset=\d+', 'offset=' + str(next_offset), current_url
        )
        yield scrapy.Request(next_url, callback=self.parse)
运行:
scrapy runspider tutorial/spiders/Meituan_meishi.py