用requests爬取一个招聘网站
生活随笔
收集整理的这篇文章主要介绍了
用requests爬取一个招聘网站
小编觉得挺不错的,现在分享给大家,给大家做个参考。
import requests
import re
# A single session object is created here; every request in this script is
# sent through it.
session = requests.session()

# Step 1: visit the login page and extract X_Anti_Forge_Token and
# X_Anti_Forge_Code from the returned HTML.
#   1. Request URL: https://passport.lagou.com/login/login.html
#   2. Request method: GET (a GET request needs no request body)
#   3. Request headers: User-Agent
r1 = session.get(
    'https://passport.lagou.com/login/login.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)
# re.findall returns a list, so the first match is taken with [0].
# NOTE(review): [0] raises IndexError if the page no longer contains the
# expected "X_Anti_Forge_... = '...'" assignment.
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
# Step 2: log in.
#   1. Request URL: https://passport.lagou.com/login/login.json
#   2. Request method: POST
#   3. Request headers: Cookie, User-Agent, Referer, X-Anit-Forge-Code,
#      X-Anit-Forge-Token (no Cookie header is set explicitly below;
#      presumably the session supplies it -- confirm)
#   4. Request body (form fields):
#        isValidate: true
#        username: 18611453110
#        password: 70621c64832c4d4d66a47be6150b4a8e
#        request_form_verifyCode: ''
#        submit: ''
# NOTE(review): the header names are spelled "Anit" consistently in this
# script; presumably that is what the server expects -- confirm before
# "correcting" the spelling.
# NOTE(review): the credentials are hard-coded sample values; replace them
# with your own account before running.
r2 = session.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        # Token and code extracted from the login page in step 1.
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
    },
    data={
        "isValidate": True,
        'username': '18611453110',  # login user name
        'password': '70621c64832c4d4d66a47be6150b4a8e',  # the already-encrypted password
        'request_form_verifyCode': '',
        'submit': '',
    },
)
# Step 3: authorization.
#   1. Request URL: https://passport.lagou.com/grantServiceTicket/grant.html
#   2. Request method: GET
#   3. Request headers: User-Agent, Referer
r3 = session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
    },
)

# Step 4: verify that the login succeeded by requesting the resume page.
r4 = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)
# Left disabled as in the original; uncomment to check whether the login
# user name appears in the returned page:
# print('18611453110' in r4.text)
# Step 5: filter the job listings.
#   Request URL: https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
#   Request method: GET
#   Request headers: User-Agent
#   Query parameters: gj, px, yx, city (values as in the disabled call below)
from urllib.parse import urlencode

# urlencode() yields "k=<percent-encoded keyword>"; only the encoded keyword
# after the "=" is kept and appended to the listing URL.
res = urlencode({'k': 'java高级开发'}, encoding='utf-8').split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res

# The plain GET below is left disabled as in the original.
# r5 = session.get(url,
#                  headers={
#                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                  },
#                  params={
#                      'gj': '3年及以下',
#                      'px': 'default',
#                      'yx': '25k-50k',
#                      'city': '北京'
#                  }
#                  )
#
# print(r5.text)
# Step 6: request the job list from the AJAX endpoint.
#   Request URL: https://www.lagou.com/jobs/positionAjax.json
#   Request method: POST
#   Request headers: Referer, User-Agent
#   Request body: first=true, pn=1, kd=<search keyword>
#   Query parameters: gj, px, yx, city, needAddtionalResult, isSchoolJob
r6 = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    headers={
        # The listing URL built in step 5 is sent as the Referer.
        'Referer': url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
    data={
        'first': True,
        'pn': 1,
        'kd': 'java高级开发',
    },
    params={
        'gj': '3年及以下',
        'px': 'default',
        'yx': '25k-50k',
        'city': '北京',
        'needAddtionalResult': False,
        'isSchoolJob': 0,
    },
)

comapines_list = r6.json()['content']['positionResult']['result']
for comapiny in comapines_list:
    positionId = comapiny['positionId']
    company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
    companyShortName = comapiny['companyShortName']
    positionName = comapiny['positionName']
    salary = comapiny['salary']
    # One field per line; the collapsed source ran all four fields together
    # with no separator.
    print('详情连接:%s\n公司名:%s\n职位名:%s\n薪资:%s'
          % (company_link, companyShortName, positionName, salary))

    # Step 7: visit the detail page and extract X_Anti_Forge_Token and
    # X_Anti_Forge_Code from it.
    #   Request URL: the detail page address
    #   Request method: GET
    #   Request headers: User-Agent
    r7 = session.get(
        company_link,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
    )
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]

    # Step 8: submit the resume.
    #   Request URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    #   Request method: POST
    #   Request headers: Referer (the detail page address), User-Agent,
    #     X-Anit-Forge-Code (e.g. 53165984),
    #     X-Anit-Forge-Token (e.g. 3b6a2f62-80f0-428b-8efb-ef72fc100d78),
    #     X-Requested-With: XMLHttpRequest
    #   Request body: positionId=<position ID>, type=1, force=true
    session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Referer': company_link,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        },
    )
    # NOTE(review): printed unconditionally; the response of the POST above
    # is not inspected.
    print('%s 投递成功' % (companyShortName))
第六步找到一个公司,进入详情页,然后投递简历。
import re
# A single session object is created here; every request in this script is
# sent through it.
session = requests.session()

# Step 1: visit the login page and extract X_Anti_Forge_Token and
# X_Anti_Forge_Code from the returned HTML.
#   1. Request URL: https://passport.lagou.com/login/login.html
#   2. Request method: GET (a GET request needs no request body)
#   3. Request headers: User-Agent
r1 = session.get(
    'https://passport.lagou.com/login/login.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)
# re.findall returns a list, so the first match is taken with [0].
# NOTE(review): [0] raises IndexError if the page no longer contains the
# expected "X_Anti_Forge_... = '...'" assignment.
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
# Step 2: log in.
#   1. Request URL: https://passport.lagou.com/login/login.json
#   2. Request method: POST
#   3. Request headers: Cookie, User-Agent, Referer, X-Anit-Forge-Code,
#      X-Anit-Forge-Token (no Cookie header is set explicitly below;
#      presumably the session supplies it -- confirm)
#   4. Request body (form fields):
#        isValidate: true
#        username: 18611453110
#        password: 70621c64832c4d4d66a47be6150b4a8e
#        request_form_verifyCode: ''
#        submit: ''
# NOTE(review): the header names are spelled "Anit" consistently in this
# script; presumably that is what the server expects -- confirm before
# "correcting" the spelling.
# NOTE(review): the credentials are hard-coded sample values; replace them
# with your own account before running.
r2 = session.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        # Token and code extracted from the login page in step 1.
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
    },
    data={
        "isValidate": True,
        'username': '18611453110',  # login user name
        'password': '70621c64832c4d4d66a47be6150b4a8e',  # the already-encrypted password
        'request_form_verifyCode': '',
        'submit': '',
    },
)
# Step 3: authorization.
#   1. Request URL: https://passport.lagou.com/grantServiceTicket/grant.html
#   2. Request method: GET
#   3. Request headers: User-Agent, Referer
r3 = session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
    },
)

# Step 4: verify that the login succeeded by requesting the resume page.
r4 = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)
# Left disabled as in the original; uncomment to check whether the login
# user name appears in the returned page:
# print('18611453110' in r4.text)
# Step 5: filter the job listings.
#   Request URL: https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
#   Request method: GET
#   Request headers: User-Agent
#   Query parameters: gj, px, yx, city (values as in the disabled call below)
from urllib.parse import urlencode

# urlencode() yields "k=<percent-encoded keyword>"; only the encoded keyword
# after the "=" is kept and appended to the listing URL.
res = urlencode({'k': 'java高级开发'}, encoding='utf-8').split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res

# The plain GET below is left disabled as in the original.
# r5 = session.get(url,
#                  headers={
#                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                  },
#                  params={
#                      'gj': '3年及以下',
#                      'px': 'default',
#                      'yx': '25k-50k',
#                      'city': '北京'
#                  }
#                  )
#
# print(r5.text)
上一步的GET请求没有取到职位数据,因为这些数据是通过ajax请求加载的,所以我们换一种方法:直接请求ajax接口。
# Step 6: request the job list from the AJAX endpoint.
#   Request URL: https://www.lagou.com/jobs/positionAjax.json
#   Request method: POST
#   Request headers: Referer, User-Agent
#   Request body: first=true, pn=1, kd=<search keyword>
#   Query parameters: gj, px, yx, city, needAddtionalResult, isSchoolJob
r6 = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    headers={
        # The listing URL built in step 5 is sent as the Referer.
        'Referer': url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
    data={
        'first': True,
        'pn': 1,
        'kd': 'java高级开发',
    },
    params={
        'gj': '3年及以下',
        'px': 'default',
        'yx': '25k-50k',
        'city': '北京',
        'needAddtionalResult': False,
        'isSchoolJob': 0,
    },
)

comapines_list = r6.json()['content']['positionResult']['result']
for comapiny in comapines_list:
    positionId = comapiny['positionId']
    company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
    companyShortName = comapiny['companyShortName']
    positionName = comapiny['positionName']
    salary = comapiny['salary']
    # One field per line; the collapsed source ran all four fields together
    # with no separator.
    print('详情连接:%s\n公司名:%s\n职位名:%s\n薪资:%s'
          % (company_link, companyShortName, positionName, salary))

    # Step 7: visit the detail page and extract X_Anti_Forge_Token and
    # X_Anti_Forge_Code from it.
    #   Request URL: the detail page address
    #   Request method: GET
    #   Request headers: User-Agent
    r7 = session.get(
        company_link,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
    )
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]

    # Step 8: submit the resume.
    #   Request URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    #   Request method: POST
    #   Request headers: Referer (the detail page address), User-Agent,
    #     X-Anit-Forge-Code (e.g. 53165984),
    #     X-Anit-Forge-Token (e.g. 3b6a2f62-80f0-428b-8efb-ef72fc100d78),
    #     X-Requested-With: XMLHttpRequest
    #   Request body: positionId=<position ID>, type=1, force=true
    session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Referer': company_link,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        },
    )
    # NOTE(review): printed unconditionally; the response of the POST above
    # is not inspected.
    print('%s 投递成功' % (companyShortName))
第七步和第八步是并列的,都放在第六步的for循环里面。
第六步找到一个公司,进入详情页,然后投递简历。
转载于:https://www.cnblogs.com/1a2a/p/8305165.html
总结
以上是生活随笔为你收集整理的用requests爬取一个招聘网站的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: postman 变量
- 下一篇: Node.js 常用Mongoose方法