数据挖掘 随机森林_第1页
数据挖掘 随机森林_第2页
数据挖掘 随机森林_第3页
数据挖掘 随机森林_第4页
数据挖掘 随机森林_第5页
已阅读5页,还剩13页未读 继续免费阅读

下载本文档

版权说明:本文档由用户提供并上传,收益归属内容提供方,若内容存在侵权,请进行举报或认领

文档简介

随机森林junjun2016年2月8日随机森林实例Markdown脚本及数据集:/s/1bnY6ar9实例一、用随机森林对鸢尾花数据进行分类#1、加载数据并查看data("iris")summary(iris)##Sepal.LengthSepal.WidthPetal.LengthPetal.Width##Min.:4.300Min.:2.000Min.:1.000Min.:0.100##1stQu.:5.1001stQu.:2.8001stQu.:1.6001stQu.:0.300##Median:5.800Median:3.000Median:4.350Median:1.300##Mean:5.843Mean:3.057Mean:3.758Mean:1.199##3rdQu.:6.4003rdQu.:3.3003rdQu.:5.1003rdQu.:1.800##Max.:7.900Max.:4.400Max.:6.900Max.:2.500##Species##setosa:50##versicolor:50##virginica:50######str(iris)##'data.frame':150obs.of5variables:##$Sepal.Length:num4.655.44.654.44.9...##$Sepal.Width:num3.52.93.1...##$Petal.Length:num1.5...##$Petal.Width:num0.1...##$Species:Factorw/3levels"setosa","versicolor",..:1111111111...#2、创建训练集和测试集数据set.seed(2001)library(caret)##Loadingrequiredpackage:lattice##Loadingrequiredpackage:ggplot2##Warning:package'ggplot2'wasbuiltunderRversion3.2.3index<-createDataPartition(iris$Species,p=0.7,list=F)train_iris<-iris[index,]test_iris<-iris[-index,]#3、建模library(randomForest)##randomForest4.6-12##TyperfNews()toseenewfeatures/changes/bugfixes.####Attachingpackage:'randomForest'##Thefollowingobjectismaskedfrom'package:ggplot2':####marginmodel_iris<-randomForest(Species~.,data=train_iris,ntree=50,nPerm=10,mtry=3,proximity=T,importance=T)#4、模型评估model_iris####Call:##randomForest(formula=Species~.,data=train_iris,ntree=50,nPerm=10,mtry=3,proximity=T,importance=T)##Typeofrandomforest:classification##Numberoftrees:50##No.ofvariablestriedateachsplit:3####OOBestimateoferrorrate:4.76%##Confusionmatrix:##setosaversicolorvirginicaclass.error##setosa35000.00000000##versicolor03230.08571429##virginica02330.05714286str(model_iris)##Listof19##$call:languagerandomForest(formula=Species~.,data=train_iris,ntree=50,nPerm=10,mtry=3,proximity=T,importance=T)##$type:chr"classification"##$predicted:Factorw/3levels"setosa","versicolor",..:1111111111...##..-attr(*,"names")=chr[1:105]"5""7""8""11"...##$err.rate:num[1:50,1:4]0.05130.07580.07410.04350.0505...##..-attr(*,"dimnames")=Listof2##....$:NULL##....$:chr[1:4]"OOB""setosa""versicolor""virginica"##$confusion:num[1:3,1:4]3500032203330...##..-attr(*,"dimnames")=Listof2##....$:chr[1:3]"setosa""versicolor""virginica"##....$:chr[1:4]"setosa""versicolor""virginica""class.error"##$votes:matrix[1:105,1:3]1111111111...##..-attr(*,"dimnames")=Listof2##....$:chr[1:105]"5""7""8""11"...##....$:chr[1:3]"setosa""versicolor""virginica"##..-attr(*,"class")=chr[1:2]"matrix""votes"##$oob.times:num[1:105]15232216171120201719...##$classes:chr[1:3]"setosa""versicolor""virginica"##$importance:num[1:4,1:5]000.34170.34918-0.00518...##..-attr(*,"dimnames")=Listof2##....$:chr[1:4]"Sepal.Length""Sepal.Width""Petal.Length""Petal.Width"##....$:chr[1:5]"setosa""versicolor""virginica""MeanDecreaseAccuracy"...##$importanceSD:num[1:4,1:4]000.045640.047110.00395...##..-attr(*,"dimnames")=Listof2##....$:chr[1:4]"Sepal.Length""Sepal.Width""Petal.Length""Petal.Width"##....$:chr[1:4]"setosa""versicolor""virginica""MeanDecreaseAccuracy"##$localImportance:NULL##$proximity:num[1:105,1:105]1111111111...##..-attr(*,"dimnames")=Listof2##....$:chr[1:105]"5""7""8""11"...##....$:chr[1:105]"5""7""8""11"...##$ntree:num50##$mtry:num3##$forest:Listof14##..$ndbigtree:int[1:50]1159999911119...##..$nodestatus:int[1:17,1:50]1-1111-1-11-1-1...##..$bestvar:int[1:17,1:50]4043300100...##..$treemap:int[1:17,1:2,1:50]20468001000...##..$nodepred:int[1:17,1:50]0100023032...##..$xbestsplit:num[1:17,1:50]0.801.655.254.85006.0500...##..$pid:num[1:3]111##..$cutoff:num[1:3]0.3330.3330.333##..$ncat:Namedint[1:4]1111##....-attr(*,"names")=chr[1:4]"Sepal.Length""Sepal.Width""Petal.Length""Petal.Width"##..$maxcat:int1##..$nrnodes:int17##..$ntree:num50##..$nclass:int3##..$xlevels:Listof4##....$Sepal.Length:num0##....$Sepal.Width:num0##....$Petal.Length:num0##....$Petal.Width:num0##$y:Factorw/3levels"setosa","versicolor",..:1111111111...##..-attr(*,"names")=chr[1:105]"5""7""8""11"...##$test:NULL##$inbag:NULL##$terms:Classes'terms','formula'length3Species~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width##....-attr(*,"variables")=languagelist(Species,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width)##....-attr(*,"factors")=int[1:5,1:4]0100000100...##......-attr(*,"dimnames")=Listof2##........$:chr[1:5]"Species""Sepal.Length""Sepal.Width""Petal.Length"...##........$:chr[1:4]"Sepal.Length""Sepal.Width""Petal.Length""Petal.Width"##....-attr(*,"term.labels")=chr[1:4]"Sepal.Length""Sepal.Width""Petal.Length""Petal.Width"##....-attr(*,"order")=int[1:4]1111##....-attr(*,"intercept")=num0##....-attr(*,"response")=int1##....-attr(*,".Environment")=<environment:R_GlobalEnv>##....-attr(*,"predvars")=languagelist(Species,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width)##....-attr(*,"dataClasses")=Namedchr[1:5]"factor""numeric""numeric""numeric"...##......-attr(*,"names")=chr[1:5]"Species""Sepal.Length""Sepal.Width""Petal.Length"...##-attr(*,"class")=chr[1:2]"randomForest.formula""randomForest"pred<-predict(model_iris,train_iris)mean(pred==train_iris[,5])##[1]1#5、预测pred_iris<-predict(model_iris,test_iris)table(pred_iris,test_iris[,5])####pred_irissetosaversicolorvirginica##setosa1500##versicolor0132##virginica0213mean(pred_iris==test_iris[,5])##[1]0.9111111library(gmodels)CrossTable(pred_iris,test_iris[,5])######CellContents##|-------------------------|##|N|##|Chi-squarecontribution|##|N/RowTotal|##|N/ColTotal|##|N/TableTotal|##|-------------------------|######TotalObservationsinTable:45######|test_iris[,5]##pred_iris|setosa|versicolor|virginica|RowTotal|##-------------|------------|------------|------------|------------|##setosa|15|0|0|15|##|20.000|5.000|5.000||##|1.000|0.000|0.000|0.333|##|1.000|0.000|0.000||##|0.333|0.000|0.000||##-------------|------------|------------|------------|------------|##versicolor|0|13|2|15|##|5.000|12.800|1.800||##|0.000|0.867|0.133|0.333|##|0.000|0.867|0.133||##|0.000|0.289|0.044||##-------------|------------|------------|------------|------------|##virginica|0|2|13|15|##|5.000|1.800|12.800||##|0.000|0.133|0.867|0.333|##|0.000|0.133|0.867||##|0.000|0.044|0.289||##-------------|------------|------------|------------|------------|##ColumnTotal|15|15|15|45|##|0.333|0.333|0.333||##-------------|------------|------------|------------|------------|####实例二、用坦泰尼克号乘客是否存活数据应用到随机森林算法中在随机森林算法的函数randomForest()中有两个非常重要的参数,而这两个参数又将影响模型的准确性,它们分别是mtry和ntree。一般对mtry的选择是逐一尝试,直到找到比较理想的值,ntree的选择可通过图形大致判断模型内误差稳定时的值。randomForest包中的randomForest(formula,data,ntree,nPerm,mtry,proximity,importace)函数:随机森林分类与回归。ntree表示生成决策树的数目(不应设置太小,默认为500);nPerm表示计算importance时的重复次数,数量大于1给出了比较稳定的估计,但不是很有效(目前只实现了回归);mtry表示选择的分裂属性的个数;proximity表示是否生成邻近矩阵,为T表示生成邻近矩阵;importance表示输出分裂属性的重要性。下面使用坦泰尼克号乘客是否存活数据应用到随机森林算法中,看看模型的准确性如何。#1、加载数据并查看:同时读取训练样本和测试样本集train<-read.table("F:\\R\\Rworkspace\\RandomForest/train.csv",header=T,sep=",")test<-read.table("F:\\R\\Rworkspace\\RandomForest/test.csv",header=T,sep=",")#注意:训练集和测试集数据来自不同的数据集,一定要注意测试集和训练集的factor的levels相同,否则,在利用训练集训练的模型对测试集进行预测时,会报错!!!str(train)##'data.frame':891obs.of8variables:##$Survived:int0111000011...##$Pclass:int3131331332...##$Sex:Factorw/2levels"female","male":2111222211...##$Age:num2238263535NA5422714...##$SibSp:int1101000301...##$Parch:int0000000120...##$Fare:num7.2571.287.9253.18.05...##$Embarked:Factorw/4levels"","C","Q","S":4244434442...str(test)##'data.frame':418obs.of7variables:##$Pclass:int3323333233...##$Sex:Factorw/2levels"female","male":2122121212...##$Age:num34.5476227221430261821...##$SibSp:int0100100102...##$Parch:int0000100100...##$Fare:num7.8379.698.6612.29...##$Embarked:Factorw/3levels"C","Q","S":2323332313...#从上可知:训练集数据共891条记录,8个变量,Embarked因子水平为4;测试集数据共418条记录,7个变量,Embarked因子水平为3;训练集中存在缺失数据;Survived因变量为数字类型,测试集数据无因变量#2、数据清洗#1)调整测试集与训练基地因子水平levels(train$Embarked)##[1]"""C""Q""S"levels(test$Embarked)##[1]"C""Q""S"levels(test$Embarked)<-levels(train$Embarked)#2)把因变量转化为因子类型train$Survived<-as.factor(train$Survived)#3)使用rfImpute()函数补齐训练集的缺失值NAlibrary(randomForest)train_impute<-rfImpute(Survived~.,data=train)##ntreeOOB12##300:16.39%7.83%30.12%##ntreeOOB12##300:16.50%8.93%28.65%##ntreeOOB12##300:16.72%8.74%29.53%##ntreeOOB12##300:16.50%8.56%29.24%##ntreeOOB12##300:17.28%9.47%29.82%#4)补齐测试集的缺失值:对待测样本进行预测,发现待测样本中存在缺失值,这里使用多重插补法将缺失值补齐summary(test)##PclassSexAgeSibSp##Min.:1.000female:152Min.:0.17Min.:0.0000##1stQu.:1.000male:2661stQu.:21.001stQu.:0.0000##Median:3.000Median:27.00Median:0.0000##Mean:2.266Mean:30.27Mean:0.4474##3rdQu.:3.0003rdQu.:39.003rdQu.:1.0000##Max.:3.000Max.:76.00Max.:8.0000##NA's:86##ParchFareEmbarked##Min.:0.0000Min.:0.000:102##1stQu.:0.00001stQu.:7.896C:46##Median:0.0000Median:14.454Q:270##Mean:0.3923Mean:35.627S:0##3rdQu.:0.00003rdQu.:31.500##Max.:9.0000Max.:512.329##NA's:1#可是看出测试集数据存在缺失值NA,Age和Fare的数据有NA#多重插补法填充缺失值:library(mice)##Loadingrequiredpackage:Rcpp##mice2.252015-11-09imput<-mice(data=test,m=10)####iterimpvariable##11AgeFare##12AgeFare##13AgeFare##14AgeFare##15AgeFare##16AgeFare##17AgeFare##18AgeFare##19AgeFare##110AgeFare##21AgeFare##22AgeFare##23AgeFare##24AgeFare##25AgeFare##26AgeFare##27AgeFare##28AgeFare##29AgeFare##210AgeFare##31AgeFare##32AgeFare##33AgeFare##34AgeFare##35AgeFare##36AgeFare##37AgeFare##38AgeFare##39AgeFare##310AgeFare##41AgeFare##42AgeFare##43AgeFare##44AgeFare##45AgeFare##46AgeFare##47AgeFare##48AgeFare##49AgeFare##410AgeFare##51AgeFare##52AgeFare##53AgeFare##54AgeFare##55AgeFare##56AgeFare##57AgeFare##58AgeFare##59AgeFare##510AgeFareAge<-data.frame(Age=apply(imput$imp$Age,1,mean))Fare<-data.frame(Fare=apply(imput$imp$Fare,1,mean))#添加行标号:test$Id<-s(test)Age$Id<-s(Age)Fare$Id<-s(Fare)#替换缺失值:test[test$Id%in%Age$Id,'Age']<-Age$Agetest[test$Id%in%Fare$Id,'Fare']<-Fare$Faresummary(test)##PclassSexAgeSibSp##Min.:1.000female:152Min.:0.17Min.:0.0000##1stQu.:1.000male:2661stQu.:22.001stQu.:0.0000##Median:3.000Median:26.19Median:0.0000##Mean:2.266Mean:29.41Mean:0.4474##3rdQu.:3.0003rdQu.:36.653rdQu.:1.0000##Max.:3.000Max.:76.00Max.:8.0000##ParchFareEmbarkedId##Min.:0.0000Min.:0.000:102Length:418##1stQu.:0.00001stQu.:7.896C:46Class:character##Median:0.0000Median:14.454Q:270Mode:character##Mean:0.3923Mean:35.583S:0##3rdQu.:0.00003rdQu.:31.472##Max.:9.0000Max.:512.329#从上可知:测试数据集中已经没有了NA值。#3、选着随机森林的mtry和ntree值#1)选着mtry(n<-length(names(train)))##[1]8library(randomForest)for(iin1:n){model<-randomForest(Survived~.,data=train_impute,mtry=i)err<-mean(model$err.rate)print(err)}##[1]0.2100028##[1]0.1889116##[1]0.1776607##[1]0.1902606##[1]0.1960938##[1]0.1953451##[1]0.1951303##[1]0.2018745#从上可知:mtry=2或者mtry=3时,模型内评价误差最小,故确定参数mtry=2或者mtry=3#2)选着ntreeset.seed(2002)model<

温馨提示

  • 1. 本站所有资源如无特殊说明,都需要本地电脑安装OFFICE2007和PDF阅读器。图纸软件为CAD,CAXA,PROE,UG,SolidWorks等.压缩文件请下载最新的WinRAR软件解压。
  • 2. 本站的文档不包含任何第三方提供的附件图纸等,如果需要附件,请联系上传者。文件的所有权益归上传用户所有。
  • 3. 本站RAR压缩包中若带图纸,网页内容里面会有图纸预览,若没有图纸预览就没有图纸。
  • 4. 未经权益所有人同意不得将文件中的内容挪作商业或盈利用途。
  • 5. 人人文库网仅提供信息存储空间,仅对用户上传内容的表现方式做保护处理,对用户上传分享的文档内容本身不做任何修改或编辑,并不能对任何下载内容负责。
  • 6. 下载文件中如有侵权或不适当内容,请与我们联系,我们立即纠正。
  • 7. 本站不保证下载资源的准确性、安全性和完整性, 同时也不承担用户因使用这些下载资源对自己和他人造成任何形式的伤害或损失。

评论

0/150

提交评论