Overview: build a random forest prediction model in R, screen predictor variables, and tune the model's hyperparameters.
R packages used: randomForest, caret (plus pROC, ggplot2, reshape2, ggprism, ggpubr and rms for evaluation and plotting).
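The packages above are loaded below. The tutorial assumes a training data frame train_data whose outcome column 脑转移 is a two-level factor (positive level "是"), together with train_x (the predictor columns) and train_y (the outcome). A minimal setup sketch follows; the data frame dat and the 70/30 split are illustrative assumptions, not part of the original workflow.
# Load the required packages
library(randomForest)
library(caret)
library(pROC)
library(ggplot2)
library(reshape2)
library(ggprism)
library(ggpubr)
library(rms)
# Assumed data preparation (sketch): dat is a hypothetical full data set whose
# outcome column 脑转移 is a factor, e.g. with levels c("否", "是")
set.seed(123)
idx <- createDataPartition(dat$脑转移, p = 0.7, list = FALSE)  # hypothetical 70/30 split
train_data <- dat[idx, ]
train_x <- train_data[, setdiff(colnames(train_data), "脑转移")]
train_y <- train_data$脑转移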
Build the initial model
form_xy <- as.formula(
paste0("脑转移~",paste(colnames(train_x),collapse = "+"))
)
form_xy
# Build the initial RF model on the training set
set.seed(123)
rf_model0 <- randomForest(form_xy,data = train_data,
nodesize=1,
importance=TRUE,
proximity=TRUE)
# Model summary
rf_model0
# Visual model assessment: error rate vs. number of trees
tiff("ERROR_TREES_plot.tiff", width = 12, height = 8, units = "in", res = 600)
plot(rf_model0, main = "ERROR & TREES")
legend("top",
legend = colnames(rf_model0$err.rate),
lty = 1:3,
col = 1:3,
horiz = TRUE)
dev.off()
Parameter tuning
# Tune mtry
# Method 1: loop over candidate mtry values
set.seed(123)
errRate <- c()
for (i in 1:min(30, ncol(train_x))) {   # mtry cannot exceed the number of predictors
  m <- randomForest(脑转移 ~ ., data = train_data, mtry = i, proximity = TRUE)
  err <- mean(m$err.rate[, "OOB"])      # mean OOB error across trees
  errRate[i] <- err
}
print(errRate)
which.min(errRate)
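As an alternative to the manual loop, randomForest's own tuneRF() searches mtry by OOB error directly. A brief sketch; the stepFactor and improve values here are illustrative choices, not from the original workflow.
set.seed(123)
tune_res <- tuneRF(x = train_x, y = train_y,
                   ntreeTry = 500,    # trees grown for each candidate mtry
                   stepFactor = 1.5,  # factor by which mtry is inflated/deflated at each step
                   improve = 0.01,    # minimum relative OOB improvement required to continue
                   trace = TRUE, plot = TRUE)
tune_res  # matrix of candidate mtry values and their OOB errors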
# Method 2: 10-fold cross-validation with the caret package
set.seed(123)
trainControl <- trainControl(method = "cv",
                             summaryFunction = twoClassSummary,
                             number = 10,   # 10-fold cross-validation
                             classProbs = TRUE,
                             savePredictions = TRUE)
# twoClassSummary reports ROC/Sens/Spec, so optimise ROC rather than Accuracy
rfFit <- train(脑转移 ~ ., data = train_data, method = "rf", metric = "ROC",
               trControl = trainControl)
rfFit
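By default train() only tries a few mtry values. To search a specific range, a tuneGrid can be supplied; a sketch with an illustrative candidate range (adjust to the number of predictors):
set.seed(123)
rfFit_grid <- train(脑转移 ~ ., data = train_data, method = "rf",
                    metric = "ROC", trControl = trainControl,
                    tuneGrid = expand.grid(mtry = seq(2, 20, by = 2)))  # assumed candidate mtry values
plot(rfFit_grid)     # cross-validated performance against mtry
rfFit_grid$bestTune  # mtry chosen by caret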
# Tune ntree
# Fix mtry at the best value found above
ctrl <- trainControl(method = "cv", number = 10, search = "grid")
bestgrid <- expand.grid(mtry = 20)  # best mtry found by the methods above (replace with your own value)
# Define a list to store each model's evaluation results
modellist <- list()
# The parameter being tuned is the number of decision trees
for (ntree in c(100,200,300,400,500)) {
set.seed(123)
fit <- train(x =train_x, y = train_y, method="rf",
metric="Accuracy", tuneGrid=bestgrid,
trControl=ctrl, ntree=ntree)
key <- toString(ntree)
modellist[[key]] <- fit
}
# compare results
results <- resamples(modellist)
# Summarise performance for each candidate ntree
summary(results)
dotplot(results)  # visualise the results for different ntree values
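To pick the winning ntree programmatically rather than by eye from the dotplot, one option is to compare the resampled Accuracy stored in each train object (a sketch):
# Best resampled Accuracy for each candidate ntree
acc <- sapply(modellist, function(fit) max(fit$results$Accuracy))
best_ntree <- as.numeric(names(which.max(acc)))
best_ntree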
# Retrain the model with the best hyperparameters
# This example runs with mtry = 20 and ntree = 100 (substitute the values found above)
set.seed(123)
rf_model2 <- randomForest(x = train_x, y = train_y,
mtry = 20,
ntree = 100,
nodesize=1,
importance=TRUE,
proximity=TRUE)
# Print the final model
print(rf_model2)
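As a quick sanity check, the final out-of-bag error of the tuned model can be compared with that of the initial model (both objects were fitted above):
# Final OOB error rate of each model (last row of the err.rate matrix)
tail(rf_model0$err.rate[, "OOB"], 1)
tail(rf_model2$err.rate[, "OOB"], 1)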
Variable selection
# Inspect the importance of each variable
importance(rf_model2)
# Larger MeanDecreaseAccuracy and MeanDecreaseGini values indicate more important variables
# For random forest regression the corresponding measures are %IncMSE and IncNodePurity
# Visualise variable importance
tiff("变量重要性.tiff", width = 6, height = 6, units = 'in', res = 600)
varImpPlot(rf_model2,
main = "varImpPlot",
n.var=11,#Top变量个数
scale=T,#显示横坐标
cex = 1#字体大小
)
dev.off()
# Plot the two measures separately
# type = 1 ranks variables by mean decrease in accuracy; type = 2 by mean decrease in Gini
varImpPlot(rf_model2, main = "variable importance",type=1)
varImpPlot(rf_model2,main = "variable importance",type=2)
# Variables can be selected from this importance ranking, e.g. the top 5 or top 10, or all variables whose importance exceeds a summary of the importance distribution (mean, median, a percentile, etc.), as sketched below
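A sketch of two such selection rules; the cut-offs (mean importance, top 10) are illustrative assumptions:
imp <- importance(rf_model2)
# Rule 1: keep variables whose MeanDecreaseGini exceeds the mean importance
selected_vars <- rownames(imp)[imp[, "MeanDecreaseGini"] > mean(imp[, "MeanDecreaseGini"])]
selected_vars
# Rule 2: keep the top 10 variables by MeanDecreaseGini
head(rownames(imp)[order(imp[, "MeanDecreaseGini"], decreasing = TRUE)], 10)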
# Recursive feature elimination with the caret package
library(caret)
control <- rfeControl(functions=rfFuncs, method="cv", number=10)
results <- rfe(train_x, train_y, sizes=c(1:10), rfeControl=control)
# Optimal number of features
print(results)
# Plot model performance against the number of features
tiff("recursive_feature_elimination.tiff", width = 8, height = 8, units = 'in', res = 600)
plot(results, type=c("g", "o"))
dev.off()
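The variables retained at the optimal subset size can be listed directly from the rfe object:
# Features selected by RFE at the optimal subset size
predictors(results)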
Model evaluation
# Training-set prediction: confusion matrix and ROC curve
# Predicted classes on the training set
predict_train <- predict(rf_model2, newdata = train_data)
# Cross-tabulate true classes against predicted classes
matrix_train <- table(train_y, predict_train)
# Training-set confusion matrix
confusionMatrix_train <- confusionMatrix(data = predict_train,
reference = train_data$脑转移,
positive = "是",
mode = "everything")
# Print training-set evaluation metrics: accuracy, precision, recall and F1
print(confusionMatrix_train)
# Plot the training-set ROC curve
# Predicted probabilities on the training set
train_predprob <- predict(rf_model2,newdata = train_data,type="prob")
#AUC
train_roc <- roc(response = train_data$脑转移, predictor = train_predprob[,2])
train_roc
# Optimal cut-off (Youden index)
train_roc_best=coords(train_roc, "best",best.method = c("youden"),
ret=c("threshold","sensitivity", "specificity"))
train_roc_best
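The same evaluation applies unchanged to a hold-out set. A sketch assuming a data frame test_data with the same columns (test_data is not defined in this tutorial):
# Predicted classes and probabilities on the assumed test set
predict_test <- predict(rf_model2, newdata = test_data)
test_predprob <- predict(rf_model2, newdata = test_data, type = "prob")
confusionMatrix(data = predict_test,
                reference = test_data$脑转移,
                positive = "是",
                mode = "everything")
roc(response = test_data$脑转移, predictor = test_predprob[, 2])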
Plotting model prediction performance
# Bar chart of variable importance
tiff("feature_importance_ranking.tiff", width = 8, height = 10, units = 'in', res = 600)
importance_scores <- importance(rf_model2)
feature_importance <- data.frame(variable = rownames(importance_scores), Importance = importance_scores[, "MeanDecreaseGini"])
ordered_features <- feature_importance[order(-feature_importance$Importance), ]
# Colour range for the gradient fill
color_range <- c("#C8EBF6", "#154778")
# Horizontal bar chart (flipped coordinates)
ggplot(ordered_features, aes(x = reorder(variable, Importance), y = Importance, fill = Importance, label = round(Importance, 2))) +
geom_bar(stat = "identity") +
geom_text(size = 3, position = position_stack(vjust = 0.5), color = "black") + # add value labels
scale_fill_gradient(low = color_range[1], high = color_range[2]) + # gradient fill
theme(
plot.title = element_text(hjust = 0.5, size = 16),
plot.caption = element_text(size = 12),
axis.text = element_text(size = 12),
axis.title = element_text(size = 15)
) +
labs(title = "Important Variables", x = "Variables", y = "Importance") +
coord_flip()
dev.off()
# Confusion matrix heat map
# The training-set confusion matrix is used as the example; the validation set works the same way, just swap the object
# Convert the confusion matrix to a data frame (rows of confusionMatrix()$table are predictions, columns are the reference)
confusion_matrix_df2 <- as.data.frame.matrix(confusionMatrix_train$table)
colnames(confusion_matrix_df2) <- c("No", "Yes")  # true classes; adjust the labels to your outcome's factor levels
rownames(confusion_matrix_df2) <- c("No", "Yes")  # predicted classes
draw_data2 <- round(confusion_matrix_df2 / rowSums(confusion_matrix_df2), 2)
draw_data2$predicted <- rownames(draw_data2)
draw_data2 <- melt(draw_data2, id.vars = "predicted")
# Plot the training-set confusion matrix heat map
ggplot(draw_data2, aes(variable, predicted, fill = value)) +
geom_tile() +
geom_text(aes(label = scales::percent(value))) +
scale_fill_gradient(low = "#ECA9B0", high = "#81D8D0") +
labs(x = "True", y = "Predicted", title = "Confusion matrix of train set") +
theme_prism(border = T)+
theme(panel.border = element_blank(),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
legend.position="none")
# Plot the training-set ROC curve (reusing train_predprob, train_roc and train_roc_best computed above)
# Compute the parameters of the training-set ROC curve
train_roc_obj <- train_roc
train_roc_auc <- auc(train_roc_obj)
# Convert the ROC object into a data frame
train_roc_data <- data.frame(specificity = train_roc_obj$specificities,
                             sensitivity = train_roc_obj$sensitivities)
# Polish the ROC curve with ggplot2
ggplot(train_roc_data, aes(x = specificity, y = sensitivity)) +
geom_line(color = "#2089CB", size = 1) +
annotate("text", x = 0.25, y = 0.25, label = paste("AUC =", round(train_roc_auc, 2)), size = 8, color = "#345D82") +
annotate("text", x = 0.7, y = 0.85, label = paste("Cutoff =", round(train_roc_best[, 1], 2)), size = 6, color = "#D2431C") +
coord_cartesian(xlim = c(1, 0), ylim = c(0, 1)) +
theme_pubr() +
labs(x = "Specificity", y = "Sensitivity") +
ggtitle("ROC Curve of train set") +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
annotate("segment", x = 1, y = 0, xend = 0, yend = 1, color = "#5d6174", linetype = "dashed") +
theme_prism(border = T) +
annotate("point", x = train_roc_best$specificity, y = train_roc_best$sensitivity, color = "#D2431C", size = 3) +
theme(axis.text = element_text(size = 10)) +
theme(axis.title.x = element_text(vjust = 2, size = 15, face = "plain")) +
theme(axis.title.y = element_text(vjust = 2, size = 15, face = "plain"))
# Training-set calibration curve
# This example uses val.prob() from the rms package
train_cv <- val.prob(train_predprob[, 2],
                     as.numeric(train_data$脑转移) - 1,  # 0/1 outcome (assumes "是" is the second factor level)
                     logistic.cal = FALSE,
                     statloc = F,
                     riskdist = c("predicted", "calibrated"),
                     legendloc = c(0.8, 0.25))
title(main = "calibration curve of train")