Target: China Food and Drug Administration (国家食品药品监督管理总局)
http://app1.sfda.gov.cn/datasearch/face3/dir.html
items.py: add the following code:
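The directory page loads its results through POSTs to search.jsp rather than plain links, so it is worth confirming the endpoint before writing the spider. A minimal sketch, assuming the requests library is installed (any HTTP client works), using the same form fields the spider below sends:

import urllib.parse
import requests  # assumption: third-party client, pip install requests

data = {
    'tableId': '32',
    'State': '1',
    'bcId': '124356639813072873644420336632',
    'tableName': 'TABLE32',
    'viewtitleName': 'COLUMN302',
    'viewsubTitleName': 'COLUMN299,COLUMN303',
    'curstart': '1',
    'tableView': urllib.parse.quote('国产药品商品名'),
}
resp = requests.post('http://app1.sfda.gov.cn/datasearch/face3/search.jsp', data=data)
print(resp.status_code, len(resp.text))  # expect 200 and a non-trivial body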
import scrapy

class Sfda1Item(scrapy.Item):
    # a single field holding the whole name -> value dict scraped from a detail page
    data = scrapy.Field()
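Sfda1Item behaves like a dict restricted to its declared fields; a quick interactive check (the value is a made-up placeholder):

from tutorial.items import Sfda1Item

item = Sfda1Item()
item['data'] = {'产品名称': 'placeholder'}  # hypothetical key/value, for illustration only
print(item['data'])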
Create a custom spider under the spiders directory:
# -*- coding: utf-8 -*-
import re
import urllib.parse

import scrapy
from scrapy.http import FormRequest

from tutorial.items import Sfda1Item


class SfdaSpider(scrapy.Spider):
    name = 'sfda'
    allowed_domains = ['sfda.gov.cn']

    def start_requests(self):
        url = 'http://app1.sfda.gov.cn/datasearch/face3/search.jsp'
        data = {
            'tableId': '32',
            'State': '1',
            'bcId': '124356639813072873644420336632',
            'tableName': 'TABLE32',
            'viewtitleName': 'COLUMN302',
            'viewsubTitleName': 'COLUMN299,COLUMN303',
            'curstart': '1',  # result page number, incremented in parse_content
            'tableView': urllib.parse.quote('国产药品商品名'),
        }
        yield FormRequest(url=url, formdata=data, meta={'data': data},
                          callback=self.parse_content)

    def parse_content(self, response):
        # list-page anchors carry the detail URL inside a javascript call
        links = response.xpath('//a').re(r"callbackC,'(.*?)',null")
        for site in links:
            record_id = re.search(r'.+Id=(.*?)$', site).group(1)
            # tableView below is the GBK percent-encoding of 国产药品商品名
            url = ('http://app1.sfda.gov.cn/datasearch/face3/content.jsp'
                   '?tableId=32&tableName=TABLE32'
                   '&tableView=%B9%FA%B2%FA%D2%A9%C6%B7%C9%CC%C6%B7%C3%FB'
                   '&Id=' + record_id)
            yield scrapy.Request(
                url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                                  'Chrome/52.0.2743.116 Safari/537.36',
                },
                callback=self.parse_detail)
        # paginate only while the current page still has results; otherwise
        # the spider would keep requesting empty pages forever
        if links:
            data = response.meta['data']
            data['curstart'] = str(int(data['curstart']) + 1)
            yield FormRequest(url=response.request.url, formdata=data,
                              meta={'data': data}, callback=self.parse_content)

    def parse_detail(self, response):
        item = {}
        # skip the header and trailing rows of the detail table
        for site in response.xpath('//table[1]//tr')[1:-1]:
            try:
                name = site.xpath('./td/text()').extract()[0]
                if not name:
                    continue
                # the second <td> holds the value; strip any markup
                value = re.sub('<.*?>', '', site.xpath('./td')[1].extract()).strip()
                item[name] = value
            except Exception as e:
                self.logger.error('parse error: %s', e)
        sfa = Sfda1Item()
        sfa['data'] = item
        yield sfa
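The two regular expressions in parse_content are easy to sanity-check on their own. The sample anchor string below is hypothetical, shaped like the javascript links on the list page:

import re

# hypothetical sample shaped like a list-page anchor
sample = "javascript:commitForECMA(callbackC,'content.jsp?tableId=32&tableName=TABLE32&Id=211315',null)"
link = re.search(r"callbackC,'(.*?)',null", sample).group(1)
record_id = re.search(r'.+Id=(.*?)$', link).group(1)
print(link)        # content.jsp?tableId=32&tableName=TABLE32&Id=211315
print(record_id)   # 211315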
pipelines.py: add the following code:
import json
import codecs

class JsonWriterPipeline(object):

    def __init__(self):
        # one JSON object per line (JSON Lines); ensure_ascii=False
        # keeps the Chinese field names readable
        self.file = codecs.open('items.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider() on pipelines; the original
        # spider_closed() name is never invoked without a signal hookup
        self.file.close()
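Since each line of items.json is an independent JSON object, the file can be streamed back without loading it whole:

import json
import codecs

with codecs.open('items.json', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        print(record['data'])  # the name -> value dict for one drug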
settings.py: add the following (to enable the pipeline):
ITEM_PIPELINES = {
    'tutorial.pipelines.JsonWriterPipeline': 300,
}
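The value 300 is the pipeline's order: Scrapy runs enabled pipelines in ascending order within the 0-1000 range, so the exact number only matters once more than one pipeline is active.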
Create a main.py in the project root for debugging:
from scrapy import cmdline
cmdline.execute('scrapy crawl sfda -L INFO'.split())
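Running python main.py from the project root is equivalent to running scrapy crawl sfda -L INFO in a shell (the -L INFO flag raises the log level so only INFO and above is printed), but the script form lets you attach an IDE debugger to the crawl.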