用Python分析哪些公司的数据分析岗值得去

上传人：1*** IP属地：江苏 上传时间：2023-05-29 格式：DOC 页数：38 大小：1.30MB 积分：35 举报 版权申诉

已阅读5页，还剩33页未读，继续免费阅读

版权说明：本文档由用户提供并上传，收益归属内容提供方，若内容存在侵权，请进行举报或认领

文档简介

用Python分析哪些企业的数据分析岗值得去

讲道理，pyspider确实是一款优秀的爬虫框架，我们能够利用它快速方便地实现一个页面的抓取。不过带来便捷性的同时，也有它的不足，复杂页面不好爬取。在此次数据爬取中，BOSS直聘是成功使用pyspider的。但拉勾网却不行，因为拉勾网的数据是Ajax加载的。拉勾网岗位数据的请求网址是不变的，改变的是表单数据，表单数据随着页数改变，请求方式为POST。这里没办法在pyspider里用循环遍历来获取每一页的数据。可能是我对pyspider框架了解得不够，还达不到得心应手。所以最终拉勾网的爬取，采取平时的方法，在PyCharm中自行编写程序。此次通过对BOSS直聘、拉勾网数据分析岗的数据进行分析，了解数据分析岗的行业情况，也以此来了解从事数据分析所需要的技能。

/01/ 网页分析

获取BOSS直聘索引页的信息，主要是岗位名称、薪资、地点、工作年限、学历要求，企业名称、类型、状态、规模。本来一开始是想对详情页进行分析，还能够获取详情页里的工作内容和工作技能需求。然后因为请求太多，就放弃了。索引页有10页，1页有30个岗位，一个详情页就需要一个请求，算起来一共有300个请求。我是到了第2页(60个请求)，就出现了访问过于频繁的警告。而只获取索引页信息的话，只有10个请求，基本上没什么问题，外加也不想去鼓捣代理IP，所以来点简单的。到时候做数据挖掘岗位的数据时，看看放慢时间能否获取成功。

获取拉勾网索引页的信息，主要是岗位名称、地点、薪资、工作年限、学历要求，企业名称、类型、状态、规模，工作技能，工作福利。网页为Ajax请求，采取PyCharm编写代码，轻车熟路。

/02/ 数据获取

01 pyspider获取BOSS直聘数据

pyspider的安装很简单，直接在命令行执行 pip3 install pyspider 即可。这里因为之前没有安装pyspider对接的PhantomJS(处理JavaScript渲染的页面)，所以需要从网站下载它的exe文件，将其放入Python的exe文件所在的文件夹下。最终在命令行输入 pyspider all，即可运行pyspider。在浏览器打开网址 http://localhost:5000/，创建项目，添加项目名称，输入请求网址，得到如下图。最终在pyspider的脚本编辑器里编写代码，结合左边的反馈情况，对代码加以修改。脚本编辑器的详细代码如下。

#!/usr/bin/env

python#

-*-

encoding:

utf-8

-*-#

Project:

BOSSfrom

pyspider.libs.base_handler

import

*import

pymysqlimport

randomimport

timeimport

recount

0class

Handler(BaseHandler):#

添加请求头,不然出现403报错crawl_config

{'headers':

{'User-Agent':

'Mozilla/5.0

(Windows

6.1;

WOW64)

AppleWebKit/537.36

(KHTML,

Gecko)

Chrome/63.0.3239.132

Safari/537.36'}}def

__init__(self):

连接数据库

self.db

pymysql.connect(host='',

user='root',

password='',

port=3306,

db='boss_job',

charset='utf8mb4')def

add_Mysql(self,

id,

job_title,

job_salary,

job_city,

job_experience,

job_education,

company_name,

company_type,

company_status,

company_people):

将数据写入数据库中

try:

cursor

self.db.cursor()

sql

'insert

into

job(id,

job_title,

job_salary,

job_city,

job_experience,

job_education,

company_name,

company_type,

company_status,

company_people)

values

("%d",

"%s",

"%s")'

(id,

job_title,

job_salary,

job_city,

job_experience,

job_education,

company_name,

company_type,

company_status,

company_people);

print(sql)

cursor.execute(sql)

print(cursor.lastrowid)

mit()

except

Exception

print(e)

self.db.rollback()@every(minutes=24

60)def

on_start(self):

因为pyspider默认是HTTP请求,对于HTTPS(加密)请求，需要添加validate_cert=False,不然599/SSL报错

self.crawl('',

callback=self.index_page,

validate_cert=False)@config(age=10

60)def

index_page(self,

response):

time.sleep(random.randint(2,

5))

for

response.doc('li

div').items():

设置全局变量

global

count

岗位名称

job_title

i('.job-title').text()

print(job_title)

岗位薪水

job_salary

i('.red').text()

print(job_salary)

岗位地点

city_result

re.search('(.*?)<em

class=',

i('.info-primary

p').html())

job_city

city_result.group(1).split('

')[0]

print(job_city)

岗位经验

experience_result

re.search('<em

class="vline"/>(.*?)<em

class="vline"/>',

i('.info-primary

p').html())

job_experience

experience_result.group(1)

print(job_experience)

岗位学历

job_education

i('.info-primary

p').text().replace('

'').replace(city_result.group(1).replace('

''),

'').replace(experience_result.group(1).replace('

''),'')

print(job_education)

企业名称

company_name

i('.info-company

a').text()

print(company_name)

企业类型

company_type_result

re.search('(.*?)<em

class=',

i('.info-company

p').html())

company_type

company_type_result.group(1)

print(company_type)

企业状态

company_status_result

re.search('<em

class="vline"/>(.*?)<em

class="vline"/>',

i('.info-company

p').html())

company_status_result:

company_status

company_status_result.group(1)

else:

company_status

'无信息'

print(company_status)

企业规模

company_people

i('.info-company

p').text().replace(company_type,

'').replace(company_status,'')

print(company_people

'')

写入数据库中

self.add_Mysql(count,

job_title,

job_salary,

job_city,

job_experience,

job_education,

company_name,

company_type,

company_status,

company_people)

获取下一页信息

response.doc('.next').attr.href

'javascript:;':

self.crawl(next,

callback=self.index_page,

validate_cert=False)

else:

print("The

Work

Done")

详情页信息获取,因为访问次数有限制,不使用

#for

each

response.doc('.name

a').items():

#url

each.attr.href

#self.crawl(each.attr.href,

callback=self.detail_page,

validate_cert=False)@config(priority=2)def

detail_page(self,

response):

详情页信息获取,因为访问次数有限制,不使用

message_job

response.doc('div

.info-primary

p').text()

city_result

re.findall('城市：(.*?)经验',

message_job)

experience_result

re.findall('经验：(.*?)学历',

message_job)

education_result

re.findall('学历：(.*)',

message_job)

message_company

response.doc('.info-company

p').text().replace(response.doc('.info-company

a').text(),'')

status_result

re.findall('(.*?)d',

message_company.split('

')[0])

people_result

message_company.split('

')[0].replace(status_result[0],

'')

return

{

"job_title":

response.doc('h1').text(),

"job_salary":

response.doc('.info-primary

.badge').text(),

"job_city":

city_result[0],

"job_experience":

experience_result[0],

"job_education":

education_result[0],

"job_skills":

response.doc('.info-primary

.job-tags

span').text(),

"job_detail":

response.doc('div').filter('.text').eq(0).text().replace('',

''),

"company_name":

response.doc('.info-company

.name

a').text(),

"company_status":

status_result[0],

"company_people":

people_result,

"company_type":

response.doc('.info-company

a').text(),

}

获取BOSS直聘数据分析岗的数据如下。

02 PyCharm获取拉勾网数据

import

requestsimport

pymysqlimport

randomimport

timeimport

jsoncount

设置请求网址及请求头参数url

''headers

{'User-Agent':

'Mozilla/5.0

(Windows

6.1;

WOW64)

AppleWebKit/537.36

(KHTML,

Gecko)

Chrome/63.0.3239.132

Safari/537.36','Cookie':

'你Cookie值','Accept':

'application/json,

text/javascript,

*/*;

q=0.01','Connection':

'keep-alive','Host':

'.com','Origin':

'','Referer':

'ttps://.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=sug&fromSearch=true&suginput=shuju'}#

连接数据库db

pymysql.connect(host='',

user='root',

password='',

port=3306,

db='lagou_job',

charset='utf8mb4')def

add_Mysql(id,

job_title,

job_salary,

job_city,

job_experience,

job_education,

company_name,

company_type,

company_status,

company_people,

job_tips,

job_welfare):#

将数据写入数据库中try:

cursor

db.cursor()

sql

'insert

into

job(id,

job_title,

job_salary,

job_city,

job_experience,

job_education,

company_name,

company_type,

company_status,

company_people,

job_tips,

job_welfare)

values

("%d",

"%s",

"%s")'

(id,

job_title,

job_salary,

job_city,

job_experience,

job_education,

company_name,

company_type,

company_status,

company_people,

job_tips,

job_welfare);

print(sql)

cursor.execute(sql)

print(cursor.lastrowid)

mit()except

Exception

print(e)

db.rollback()def

get_message():for

range(1,

31):

print('第'

str(i)

'页')

time.sleep(random.randint(10,

20))

data

{

'first':

'false',

'pn':

'kd':

'数据分析'

}

response

requests.post(url=url,

data=data,

headers=headers)

result

json.loads(response.text)

job_messages

result['content']['positionResult']['result']

for

job

job_messages:

global

count

岗位名称

job_title

job['positionName']

print(job_title)

岗位薪水

job_salary

job['salary']

print(job_salary)

岗位地点

job_city

job['city']

print(job_city)

岗位经验

job_experience

job['workYear']

print(job_experience)

岗位学历

job_education

job['education']

print(job_education)

企业名称

company_name

job['companyShortName']

print(company_name)

企业类型

company_type

job['industryField']

print(company_type)

企业状态

company_status

job['financeStage']

print(company_status)

企业规模

company_people

job['companySize']

print(company_people)

工作技能

len(job['positionLables'])

job_tips

','.join(job['positionLables'])

else:

job_tips

'None'

print(job_tips)

工作福利

job_welfare

job['positionAdvantage']

print(job_welfare

'')

写入数据库

add_Mysql(count,

job_title,

job_salary,

job_city,

job_experience,

job_education,

人人文库> 全部分类> 教育资料 > 课件下载

温馨提示

1. 本站所有资源如无特殊说明，都需要本地电脑安装OFFICE2007和PDF阅读器。图纸软件为CAD,CAXA,PROE,UG,SolidWorks等.压缩文件请下载最新的WinRAR软件解压。
2. 本站的文档不包含任何第三方提供的附件图纸等，如果需要附件，请联系上传者。文件的所有权益归上传用户所有。
3. 本站RAR压缩包中若带图纸，网页内容里面会有图纸预览，若没有图纸预览就没有图纸。
4. 未经权益所有人同意不得将文件中的内容挪作商业或盈利用途。
5. 人人文库网仅提供信息存储空间，仅对用户上传内容的表现方式做保护处理，对用户上传分享的文档内容本身不做任何修改或编辑，并不能对任何下载内容负责。
6. 下载文件中如有侵权或不适当内容，请与我们联系，我们立即纠正。
7. 本站不保证下载资源的准确性、安全性和完整性, 同时也不承担用户因使用这些下载资源对自己和他人造成任何形式的伤害或损失。

用Python分析哪些公司的数据分析岗值得去

文档简介

温馨提示

最新文档

评论

用Python分析哪些公司的数据分析岗值得去

文档简介

温馨提示

最新文档

评论

相关文档