抓取百度贴吧
采集 网络爬虫吧 的所有贴吧信息
http://tieba.baidu.com/f?ie=utf-8&kw=%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB&fr=search
解决问题思路:
确认需求数据在哪
右键查看源代码
Fiddler模拟发送数据
源码
# -*- coding:utf-8 -*-
import urllib2
import urllib
from lxml import etree
import chardet
import json
import codecs
def GetTimeByArticle(url):
request = urllib2.Request(url)
response = urllib2.urlopen(request)
resHtml = response.read()
html = etree.HTML(resHtml)
time = html.xpath('//span[@class="tail-info"]')[1].text
print time
return time
def main():
output = codecs.open('tieba0812.json', 'w', encoding='utf-8')
for pn in range(0, 250, 50):
kw = u'网络爬虫'.encode('utf-8')
url = 'http://tieba.baidu.com/f?kw=' + urllib.quote(kw) + '&ie=utf-8&pn=' + str(pn)
print url
request = urllib2.Request(url)
response = urllib2.urlopen(request)
resHtml = response.read()
print resHtml
html_dom = etree.HTML(resHtml)
# print etree.tostring(html_dom)
html = html_dom
# site = html.xpath('//li[@data-field]')[0]
for site in html.xpath('//li[@data-field]'):
# print etree.tostring(site.xpath('.//a')[0])
title = site.xpath('.//a')[0].text
Article_url = site.xpath('.//a')[0].attrib['href']
reply_date = GetTimeByArticle('http://tieba.baidu.com' + Article_url)
jieshao = site.xpath('.//*[@class="threadlist_abs threadlist_abs_onlyline "]')[0].text.strip()
author = site.xpath('.//*[@class="frs-author-name j_user_card "]')[0].text.strip()
lastName = site.xpath('.//*[@class="frs-author-name j_user_card "]')[1].text.strip()
print title, jieshao, Article_url, author, lastName
item = {}
item['title'] = title
item['author'] = author
item['lastName'] = lastName
item['reply_date'] = reply_date
print item
line = json.dumps(item, ensure_ascii=False)
print line
print type(line)
output.write(line + "\n")
output.close()
print 'end'
# Script entry point: crawl only when executed directly, not on import.
if __name__ == '__main__':
    main()