抓取百度贴吧

采集“网络爬虫”吧的所有帖子信息

http://tieba.baidu.com/f?ie=utf-8&kw=%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB&fr=search

解决问题思路:

  1. 确认需求数据在哪

    右键查看源代码

  2. 用 Fiddler 模拟发送数据

源码

# -*- coding:utf-8 -*-
import urllib2
import urllib
from lxml import etree
import chardet
import json
import codecs


def GetTimeByArticle(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    resHtml = response.read()
    html = etree.HTML(resHtml)
    time = html.xpath('//span[@class="tail-info"]')[1].text
    print time
    return time


def main():
    output = codecs.open('tieba0812.json', 'w', encoding='utf-8')

    for pn in range(0, 250, 50):

        kw = u'网络爬虫'.encode('utf-8')

        url = 'http://tieba.baidu.com/f?kw=' + urllib.quote(kw) + '&ie=utf-8&pn=' + str(pn)
        print url
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)

        resHtml = response.read()
        print resHtml

        html_dom = etree.HTML(resHtml)
        # print etree.tostring(html_dom)
        html = html_dom

        # site = html.xpath('//li[@data-field]')[0]
        for site in html.xpath('//li[@data-field]'):
            # print etree.tostring(site.xpath('.//a')[0])
            title = site.xpath('.//a')[0].text
            Article_url = site.xpath('.//a')[0].attrib['href']
            reply_date = GetTimeByArticle('http://tieba.baidu.com' + Article_url)

            jieshao = site.xpath('.//*[@class="threadlist_abs threadlist_abs_onlyline "]')[0].text.strip()
            author = site.xpath('.//*[@class="frs-author-name j_user_card "]')[0].text.strip()
            lastName = site.xpath('.//*[@class="frs-author-name j_user_card "]')[1].text.strip()
            print title, jieshao, Article_url, author, lastName

            item = {}

            item['title'] = title
            item['author'] = author
            item['lastName'] = lastName
            item['reply_date'] = reply_date
            print item

            line = json.dumps(item, ensure_ascii=False)
            print line
            print type(line)

            output.write(line + "\n")
        output.close()
    print 'end'

# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()