Sunshine Hotline Q&A Platform (阳光热线问政平台)

Target list page: http://wz.sun0769.com/index.php/question/questionType?type=4

items.py: add the following code

from scrapy.item import Item, Field

class SunItem(Item):
    number = Field()   # complaint number
    url = Field()      # detail-page URL
    title = Field()    # complaint title
    content = Field()  # complaint body text
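
A SunItem behaves like a dict, so fields can be set and read by key. A quick illustrative check (the values here are made up):

from tutorial.items import SunItem

item = SunItem(title='example title', number='191166')
print(item['title'], item['number'])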

Create a custom spider, SunSpider.py, under the spiders directory:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from tutorial.items import SunItem
import re


class SunSpider(CrawlSpider):
    name = 'sun0769'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4']

    rules = (
        # Follow pagination links after normalizing their URLs
        Rule(LinkExtractor(allow='page'), process_links='rewrite_page_links', follow=True),
        # Hand every question detail page to parse_content
        Rule(LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml$'), callback='parse_content'),
    )

    def rewrite_page_links(self, links):
        # Rebuild each pagination URL into the ?page=...&type=... form
        ret = []
        for link in links:
            try:
                page = re.search(r'page=\d*', link.url).group()
                qtype = re.search(r'type=\d+', link.url).group()
                link.url = ('http://wz.sun0769.com/index.php/question/questionType?'
                            + page + '&' + qtype)
            except AttributeError:
                pass  # leave links without page/type parameters unchanged
            ret.append(link)
        return ret

    def parse_content(self, response):
        item = SunItem()

        # The <strong> heading contains both the title and the question number
        heading = response.xpath(
            '//*[@class="greyframe"]/div/div/strong/text()')[0].extract().strip()

        item['url'] = response.url
        item['title'] = heading
        # The number is whatever follows the last colon in the heading
        item['number'] = heading.split(':')[-1]
        item['content'] = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract()[0].strip()
        yield item
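
To see what rewrite_page_links does to a pagination URL, the same regex logic can be tried standalone (the sample URL below is illustrative):

import re

# Illustrative pager URL with the parameters in the other order
url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=30'
page = re.search(r'page=\d*', url).group()
qtype = re.search(r'type=\d+', url).group()
print('http://wz.sun0769.com/index.php/question/questionType?' + page + '&' + qtype)
# http://wz.sun0769.com/index.php/question/questionType?page=30&type=4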

pipelines.py: add the following code

import json
import codecs

class JsonWriterPipeline(object):

    def __init__(self):
        self.file = codecs.open('items.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Write each item as one JSON object per line (JSON Lines format)
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider() automatically when the spider finishes
        self.file.close()
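
Since each item is one JSON line, the results are easy to load back for a quick check; a minimal sketch:

import json
import codecs

with codecs.open('items.json', encoding='utf-8') as f:
    items = [json.loads(line) for line in f]
print(len(items), 'items written')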

settings.py: add the following code (to enable the pipeline)

ITEM_PIPELINES = {
    'tutorial.pipelines.JsonWriterPipeline': 300,
}
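
The value 300 is the pipeline's run order: Scrapy accepts values from 0 to 1000, and lower numbers run first. Two optional settings that can make test crawls more pleasant (illustrative values, not part of the original setup):

DOWNLOAD_DELAY = 1   # pause one second between requests
LOG_LEVEL = 'INFO'   # cut down console noise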

Debugging on Windows

Create a main.py file in the project root for debugging:

from scrapy import cmdline
# Equivalent to running `scrapy crawl sun0769` from the command line
cmdline.execute('scrapy crawl sun0769'.split())
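
Running main.py from an IDE's debugger starts the crawl with breakpoints available. If you only want the JSON output and not the custom pipeline, Scrapy's built-in -o flag exports items directly; a variant sketch (exported.json is an arbitrary filename):

from scrapy import cmdline
cmdline.execute('scrapy crawl sun0769 -o exported.json'.split())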