阳光热线问政平台
http://wz.sun0769.com/index.php/question/questionType?type=4
items.py:添加以下代码
from scrapy.item import Item, Field
class SunItem(Item):
    """Record for one question page scraped from the Sunshine Hotline site."""

    # Last ':'-separated part of the page heading text.
    number = Field()
    # URL of the page the item was extracted from.
    url = Field()
    # Stripped heading text of the question page.
    title = Field()
    # Stripped first text node of the question body.
    content = Field()
在 spiders 目录下新建自定义爬虫文件 SunSpider.py,添加以下代码
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from tutorial.items import SunItem
import scrapy
import urllib
import time
import re
class SunSpider(CrawlSpider):
    """Crawl the Sunshine Hotline question board on wz.sun0769.com.

    Pagination links on the listing pages are normalised and followed, and
    every question detail page is turned into a ``SunItem``.
    """

    name = 'sun0769'
    num = 0
    # Scrapy reads ``allowed_domains`` (the former ``allow_domain`` attribute
    # was silently ignored) and expects bare host names rather than URLs.
    allowed_domains = ['wz.sun0769.com']
    start_urls = [
        'http://wz.sun0769.com/index.php/question/questionType?type=4',
    ]

    # Canonical listing URL that pagination links are rewritten to.
    _LISTING_URL = 'http://wz.sun0769.com/index.php/question/questionType?'
    _PAGE_RE = re.compile(r'page=\d*')
    _TYPE_RE = re.compile(r'type=\d+')

    # A tuple (not a set) so the rules keep their definition order, which
    # CrawlSpider uses when more than one rule matches the same link.
    rules = (
        # Pagination links: normalise the URL, then keep following.
        Rule(LinkExtractor(allow=r'page'),
             process_links='process_request',
             follow=True),
        # Question detail pages: extract one item from each.
        Rule(LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml$'),
             callback='parse_content'),
    )

    def process_request(self, links):
        """Rewrite pagination links to the canonical listing URL.

        Registered as the ``process_links`` hook of the pagination rule.

        Args:
            links: link objects extracted by the rule; each exposes a
                writable ``url`` attribute.

        Returns:
            A list with the same link objects in the same order. Links whose
            URL contains both a ``page=`` and a ``type=`` parameter have
            their ``url`` replaced; all others are passed through unchanged.
        """
        ret = []
        for link in links:
            page = self._PAGE_RE.search(link.url)
            kind = self._TYPE_RE.search(link.url)
            # Rewrite only when both parameters are present, instead of
            # relying on a swallowed exception to skip the others.
            if page is not None and kind is not None:
                link.url = (self._LISTING_URL + page.group() + "&" +
                            kind.group())
            ret.append(link)
        return ret

    def parse_content(self, response):
        """Extract a ``SunItem`` from a question detail page.

        Args:
            response: the downloaded detail page.

        Yields:
            One ``SunItem`` with ``url``, ``title``, ``number`` and
            ``content`` set. Nothing is yielded when the page has no heading
            text; a missing body yields an empty ``content``.
        """
        headings = response.xpath(
            '//*[@class="greyframe"]/div/div/strong/text()').extract()
        if not headings:
            # Previously this raised IndexError; skip the page explicitly.
            self.logger.warning('No title found on %s, page skipped',
                                response.url)
            return
        title = headings[0].strip()

        bodies = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract()
        if not bodies:
            self.logger.warning('No content found on %s', response.url)

        item = SunItem()
        item['url'] = response.url
        item['title'] = title
        # The number is whatever follows the last ':' in the heading text.
        item['number'] = title.split(':')[-1]
        item['content'] = bodies[0].strip() if bodies else ''
        yield item
在pipelines.py:添加如下代码
import json
import codecs
class JsonWriterPipeline(object):
    """Item pipeline that writes each item to ``items.json`` as one JSON line.

    The file is opened (and truncated) when the pipeline is created, is
    UTF-8 encoded, and keeps non-ASCII characters unescaped.
    """

    def __init__(self):
        self.file = codecs.open('items.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write ``item`` as a JSON line and return it unchanged.

        Args:
            item: the scraped item; anything ``dict()`` can convert.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines can process it.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        ``close_spider`` is the hook name Scrapy invokes on item pipelines;
        the former ``spider_closed`` method was never called automatically,
        so the file was left open.
        """
        self.file.close()

    def spider_closed(self, spider):
        """Backward-compatible alias for ``close_spider``."""
        self.close_spider(spider)
settings.py:添加如下代码(启用组件)
# Enable the JSON writer pipeline defined in tutorial/pipelines.py.
ITEM_PIPELINES = {'tutorial.pipelines.JsonWriterPipeline': 300}
Windows 下调试
在项目根目录下新建main.py文件,用于调试
# Debug entry point: passes the argv of ``scrapy crawl sun0769`` to Scrapy.
from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'sun0769'])