R语言随机森林

您所在的位置：网站首页 › r语言随机森林图 › R语言随机森林

R语言随机森林

2023-06-06 00:18| 来源: 网络整理| 查看: 265

关注微信公共号：小程在线

关注CSDN博客：程志伟的博客

R版本：3.6.1

randomForest包：提供randomForest()函数用于随机森林的建立

rfImpute()函数：对数据缺失值进行插补

treesize()函数：查看模型每棵决策树的节点数

importance()函数：提取模型中各变量对模型的重要性

> setwd('G:\\R语言\\大三下半年\\数据挖掘：R语言实战\\') > library('randomForest') > set.seed(4) > data(mtcars) > mtcars.rf=randomForest(mpg~.,data=mtcars,ntree=1000,importance=TRUE)

#提取模型中的重要值 > importance(mtcars.rf) %IncMSE IncNodePurity cyl 16.151445 154.16459 disp 18.833040 255.10218 hp 18.641110 201.42227 drat 6.343488 65.96680 wt 19.987072 247.29443 qsec 4.656151 30.95240 vs 5.627916 27.14099 am 4.064642 15.18171 gear 5.825897 20.12545 carb 9.383633 31.03605 > #MDSplot函数绘制坐标图 > set.seed(1) > data(iris) > iris.rf=randomForest(Species~.,iris,proximity=T) > MDSplot(iris.rf,iris$Species,palette=rep(1,3),pch=as.numeric(iris$Species))

#rfImpute()函数可以对缺失值进行插补。 > data("iris") > iris.na=iris > iris.na[75,2]=NA;iris.na[125,3]=NA; > set.seed(111) > iris.imputed=rfImpute(Species~.,data=iris.na) ntree OOB 1 2 3 300: 4.67% 0.00% 6.00% 8.00% ntree OOB 1 2 3 300: 4.67% 0.00% 6.00% 8.00% ntree OOB 1 2 3 300: 4.00% 0.00% 6.00% 6.00% ntree OOB 1 2 3 300: 4.67% 0.00% 6.00% 8.00% ntree OOB 1 2 3 300: 4.67% 0.00% 6.00% 8.00%

#通过对缺失值插补，可以看出非常接近实际值 > list("real"=iris[c(75,125),1:4],"have-NA"=iris.na[c(75,125),1:4], + "disposed"=round(iris.imputed[c(75,125),2:5],1)) $real Sepal.Length Sepal.Width Petal.Length Petal.Width 75 6.4 2.9 4.3 1.3 125 6.7 3.3 5.7 2.1

$`have-NA` Sepal.Length Sepal.Width Petal.Length Petal.Width 75 6.4 NA 4.3 1.3 125 6.7 3.3 NA 2.1

$disposed Sepal.Length Sepal.Width Petal.Length Petal.Width 75 6.4 2.8 4.3 1.3 125 6.7 3.3 5.6 2.1

#treesize()函数可以查看随机森林中每棵决策树的节点数 > iris.rf<-randomForest(Species~.,iris) > hist(treesize(iris.rf))

#可视化分析 > data(airquality) > set.seed(131) > ozone.rf=randomForest(Ozone~.,data=airquality,mtry=3,importance=T, + na.action=na.omit) > plot(ozone.rf) #模型误差在210之后没有太大的变化

############## 应用案例 #################

> wine=read.csv("G:\\R语言\\大三下半年\\数据挖掘：R语言实战\\数据挖掘：R语言实战（案例数据集）\\11 随机森林\\winequality-white.csv",header=T,sep = ";",na.strings="null") > summary(wine) fixed.acidity volatile.acidity citric.acid residual.sugar Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700 Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200 Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900 Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800 chlorides free.sulfur.dioxide total.sulfur.dioxide Min. :0.00900 Min. : 2.00 Min. : 9.0 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0 Median :0.04300 Median : 34.00 Median :134.0 Mean :0.04577 Mean : 35.31 Mean :138.4 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0 Max. :0.34600 Max. :289.00 Max. :440.0 density pH sulphates alcohol Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 Median :0.9937 Median :3.180 Median :0.4700 Median :10.40 Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20 quality Min. :3.000 1st Qu.:5.000 Median :6.000 Mean :5.878 3rd Qu.:6.000 Max. :9.000

#设置中间变量对处理后的向量进行临时存储 > cha=0 > for(i in 1:4898)#针对每一个样本进行调整 + { + if(wine[i,12]>6)cha[i]="good" + else if(wine[i,12]>5)cha[i]="mid" + else cha[i]="bad" + } > wine[,12]=factor(cha)#将字符型变量转化为含有因子的变量赋值给数据集wine > summary(wine$quality) bad good mid 1640 1060 2198

#利用第一种格式 > set.seed(71)#设置随机数生成器初始值 > samp=sample(1:4898,3000) > set.seed(111) > wine.rf=randomForest(quality~.,data=wine,importance=TRUE,proximity=TRUE,ntree=500,subset=samp)#构建决策树为500棵的随机森林模型

#利用第二种格式 > x=subset(wine,select=-quality)#除quality以外的数据为自变量 > y=wine$quality#提取quality为响应变量 > set.seed(71) > samp=sample(1:4898,3000) > xr=x[samp,];yr=y[samp] > set.seed(111) > wine.rf=randomForest(xr,yr,importance=TRUE,proximity=TRUE,ntree=500)

#输出模型

#Type of random forest: classification，表示该模型为判别模型

#Number of trees：包含500棵决策树

#No. of variables tried at each split:表示每棵决策树节点处所选择的变量个数为3

#OOB estimate of error rate：模型的误差为30.57%

> print(wine.rf)

Call: randomForest(x = xr, y = yr, ntree = 500, importance = TRUE, proximity = TRUE) Type of random forest: classification Number of trees: 500 No. of variables tried at each split: 3

OOB estimate of error rate: 30.57% Confusion matrix: bad good mid class.error bad 697 21 283 0.3036963 good 12 392 238 0.3894081 mid 227 136 994 0.2675018

#对模型提取重要值列表 > importance(wine.rf) bad good mid MeanDecreaseAccuracy fixed.acidity 38.39220 38.04701 28.23129 54.35687 volatile.acidity 60.62095 57.14363 47.73287 81.72092 citric.acid 33.95011 37.38719 27.88769 46.44210 residual.sugar 31.46575 36.43637 35.79771 55.19329 chlorides 44.52846 46.47096 27.53577 56.75493 free.sulfur.dioxide 45.25569 41.41413 34.22959 64.04598 total.sulfur.dioxide 34.41994 41.26107 27.18189 50.61239 density 31.79573 42.52763 32.02053 53.63531 pH 33.29907 44.35856 26.93746 51.25421 sulphates 31.49062 36.61428 28.59253 50.10378 alcohol 66.08129 66.88405 32.11592 82.88410 MeanDecreaseGini fixed.acidity 138.8276 volatile.acidity 195.5095 citric.acid 149.4469 residual.sugar 163.2376 chlorides 169.0678 free.sulfur.dioxide 178.9195 total.sulfur.dioxide 170.5083 density 204.5551 pH 158.8621 sulphates 145.9429 alcohol 240.0514

varImpPlot(wine.rf, main = "variable importance")

#优化模型 > mtry_error=0 > for(i in 1:(ncol(wine)-1)) + { + set.seed(100) + newModel=randomForest(quality~.,data=wine,mtry=i,importance=TRUE,ntree=1000) + mtry_error[i]=mean(newModel$err.rate) + } > mtry_error [1] 0.2641172 0.2644469 0.2655955 0.2648887 0.2635614 0.2658425 0.2640577 [8] 0.2657842 0.2668288 0.2663740 0.2700658

#通过上面的分析，mtry（每个节点处所选择的变量个数）为5时错误率最小

> plot(mtry_error,xlab = "mtry",ylab = "error",type="l")

> set.seed(222) > Model=randomForest(quality~.,data=wine,mtry=5,importance=TRUE,ntree=1000) > plot(Model)#绘制模型误差与决策树数量关系图

#通过上面的分析，决策树数量取400，mtry（每个节点处所选择的变量个数）取5 > set.seed(222) > Model=randomForest(quality~.,data=wine,mtry=5,importance=TRUE,ntree=400) > print(Model)

Call: randomForest(formula = quality ~ ., data = wine, mtry = 5, importance = TRUE, ntree = 400) Type of random forest: classification Number of trees: 400 No. of variables tried at each split: 5

OOB estimate of error rate: 25.36% Confusion matrix: bad good mid class.error bad 1227 16 397 0.2518293 good 25 703 332 0.3367925 mid 294 178 1726 0.2147407 > hist(treesize(Model))#展示随机森林中每棵决策树的节点数

【本文地址】

R语言随机森林

R语言随机森林

今日新闻

推荐新闻

R语言 随机森林

R语言 随机森林

今日新闻

推荐新闻

R语言随机森林

R语言随机森林