拉勾招聘网

以拉勾网的具体职位详情页为例，进行抓取：

http://www.lagou.com/jobs/2101463.html

from lxml import etree
import requests
import re

# Download the job-detail page.  The timeout keeps the script from hanging
# forever on an unresponsive server, and raise_for_status() reports an HTTP
# error right here instead of letting an error page be parsed and fail later
# as an unrelated IndexError in one of the XPath lookups.
response = requests.get('http://www.lagou.com/jobs/2101463.html', timeout=10)
response.raise_for_status()
resHtml = response.text

# Parse the markup into an lxml element tree; the XPath queries that follow
# all run against this `html` root.
html = etree.HTML(resHtml)

# The job title is read from the "title" attribute of the first <h1> element
# that carries one.
title = html.xpath('//h1[@title]')[0].attrib['title']

# Evaluate the <span> query once and reuse the result instead of re-running
# the identical XPath for every field.  The first five spans are taken, in
# document order, as salary, work location, experience, education and work
# type; unpacking raises ValueError if the page has fewer than five.
request_spans = html.xpath('//dd[@class="job_request"]/p/span')
salary, worklocation, experience, education, worktype = [
    span.text for span in request_spans[:5]
]

# Text of the second <p> inside the job_request block.
Temptation = html.xpath('//dd[@class="job_request"]/p[2]')[0].text

# A single pre-formatted argument makes this line behave the same under the
# Python 2 print statement and the Python 3 print function: values separated
# by one space, with None rendered as "None".
print('%s %s %s %s %s %s' % (
    salary, worklocation, experience, education, worktype, Temptation))

# First <dd class="job_bt"> element; it is serialized below to get its markup.
description_tag = html.xpath('//dd[@class="job_bt"]')[0]

# tostring(..., encoding='utf-8') returns a byte string.  Decode it to text
# so the text regex below works on Python 3 as well (a str pattern applied
# to bytes raises TypeError there).
description = etree.tostring(description_tag, encoding='utf-8').decode('utf-8')

# Remove every tag, leaving only the text between them.  [^>]* (unlike .*?)
# also matches tags that happen to span a line break.
deal_descp = re.sub(r'<[^>]*>', '', description)
print(deal_descp.strip())
# First "title" attribute found on or below the element with
# class="publisher_name".
publisher_name = html.xpath('//*[@class="publisher_name"]//@title')[0]

# Text of the first element with class="pos".
pos = html.xpath('//*[@class="pos"]')[0].text

# Both values come from elements with class="data"; query once and index the
# result instead of running the same XPath twice.
# NOTE(review): the names look like pinyin for "processing rate" and
# "processing time" -- confirm against the page.
data_nodes = html.xpath('//*[@class="data"]')
chuli_lv = data_nodes[0].text
chuli_yongshi = data_nodes[1].text

# A single pre-formatted argument prints identically under the Python 2
# print statement and the Python 3 print function.
print('%s %s %s %s' % (chuli_lv, chuli_yongshi, pos, publisher_name))