国家食品药品监督管理总局 (China Food and Drug Administration, CFDA)

http://app1.sfda.gov.cn/datasearch/face3/dir.html
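
The search page fills its result list by POSTing a form to search.jsp (one page of results per request, driven by the curstart field), and each result row links to a detail page at content.jsp. Before wiring this into Scrapy, the endpoint can be probed directly; a minimal sketch using the requests library, with the same form fields the spider below sends (the endpoint and fields are taken from this tutorial and may no longer be live):

import requests

data = {
    'tableId': '32',
    'State': '1',
    'bcId': '124356639813072873644420336632',
    'tableName': 'TABLE32',
    'viewtitleName': 'COLUMN302',
    'viewsubTitleName': 'COLUMN299,COLUMN303',
    'curstart': '1',
}
# expect an HTML fragment containing one page of result links
resp = requests.post('http://app1.sfda.gov.cn/datasearch/face3/search.jsp', data=data)
print(resp.status_code, len(resp.text))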

items.py: add the following code

import scrapy

class Sfda1Item(scrapy.Item):
    # a single catch-all field holding one record's field-name/value dict
    data = scrapy.Field()
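
Each scraped record becomes one Sfda1Item whose data field carries a dict of field-name/value pairs read from a detail page; for illustration (the keys shown are hypothetical, the real ones come from whatever the detail table contains):

item = Sfda1Item(data={'药品名称': 'xxx', '批准文号': 'xxx'})
print(item['data'])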

Create a custom spider under the spiders directory

# -*- coding: utf-8 -*-
import re
from urllib.parse import quote

import scrapy
from scrapy.http import FormRequest

from tutorial.items import Sfda1Item


class SfdaSpider(scrapy.Spider):
    name = 'sfda'
    allowed_domains = ['sfda.gov.cn']

    def start_requests(self):
        url = 'http://app1.sfda.gov.cn/datasearch/face3/search.jsp'
        data = {
            'tableId': '32',
            'State': '1',
            'bcId': '124356639813072873644420336632',
            'tableName': 'TABLE32',
            'viewtitleName': 'COLUMN302',
            'viewsubTitleName': 'COLUMN299,COLUMN303',
            'curstart': '1',
            # the site expects the GBK percent-encoding of the category name,
            # matching the tableView value in the detail-page URL below
            'tableView': quote('国产药品商品名'.encode('gbk')),
        }
        yield FormRequest(url=url, formdata=data, meta={'data': data},
                          callback=self.parse_content)

    def parse_content(self, response):
        # result anchors carry the detail-page URL inside a JavaScript
        # onclick handler rather than in href, so extract it with a regex
        links = response.xpath('//a').re(r"callbackC,'(.*?)',null")
        for site in links:
            record_id = re.search(r'.+Id=(.*?)$', site).group(1)
            url = ('http://app1.sfda.gov.cn/datasearch/face3/content.jsp'
                   '?tableId=32&tableName=TABLE32'
                   '&tableView=%B9%FA%B2%FA%D2%A9%C6%B7%C9%CC%C6%B7%C3%FB'
                   '&Id=' + record_id)
            yield scrapy.Request(
                url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
                },
                callback=self.parse_detail)

        # request the next result page only while the current page still has
        # results; stop paginating once a page comes back empty
        if links:
            data = response.meta['data']
            data['curstart'] = str(int(data['curstart']) + 1)
            yield FormRequest(url=response.request.url, formdata=data,
                              meta={'data': data}, callback=self.parse_content)

    def parse_detail(self, response):
        item = {}
        # the detail page is a two-column table (field name, field value);
        # skip the header and footer rows
        for row in response.xpath('//table[1]//tr')[1:-1]:
            try:
                name = row.xpath('./td/text()').extract_first()
                if not name:
                    continue
                # the second <td> holds the value; strip any embedded tags
                value = re.sub('<.*?>', '', row.xpath('./td')[1].extract()).strip()
                item[name] = value
            except Exception as e:
                self.logger.error('error parsing detail row: %s', e)
        sfa = Sfda1Item()
        sfa['data'] = item
        yield sfa
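
The trickiest part of parse_content is the link extraction: the result anchors keep the detail-page URL inside a JavaScript onclick handler instead of href, and the .re() pattern pulls it out before a second regex isolates the record id. A standalone illustration on a made-up but representative onclick string:

import re

# hypothetical onclick value of one result anchor
onclick = "javascript:commitForECMA(callbackC,'content.jsp?tableId=32&tableName=TABLE32&Id=211315',null)"

site = re.search(r"callbackC,'(.*?)',null", onclick).group(1)
print(site)  # content.jsp?tableId=32&tableName=TABLE32&Id=211315
print(re.search(r'.+Id=(.*?)$', site).group(1))  # 211315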

pipelines.py: add the following code

import json
import codecs

class JsonWriterPipeline(object):

    def __init__(self):
        # UTF-8 output so the Chinese field names stay readable
        self.file = codecs.open('items.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one JSON object per line (JSON Lines format)
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls this automatically when the spider finishes
        self.file.close()
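
Because the pipeline writes one JSON object per line, the output can be consumed incrementally; a minimal sketch for reading it back:

import json

with open('items.json', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        print(record['data'])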

settings.py: add the following code (to enable the pipeline component)

ITEM_PIPELINES = {
    'tutorial.pipelines.JsonWriterPipeline': 300,
}
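
As an aside, Scrapy's built-in feed exports can produce a similar line-per-item file without a custom pipeline:

scrapy crawl sfda -o items.jl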

Create a main.py in the project root for debugging

from scrapy import cmdline
cmdline.execute('scrapy crawl sfda -L INFO'.split())
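
Running python main.py from the project root is equivalent to invoking scrapy crawl sfda -L INFO on the command line, but makes it easy to set breakpoints in an IDE.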