简介:利用tidyverse包中的across和map函数及传统的for循环功能,试图实现统计分析中大量重复工作的批量化以及繁杂代码的精简化,目前只汇总了目前所用到的一些批量化程序,进一步汇总有待后续更新。。。
设置变量类型
#批量设置因子型/有序型/字符型/数值型变量
variables <-c("Group.1.vs..2","Group.1.vs..3")
variables <-colnames(adam[,1:14]) #两种设置方式都可
test <- d1 %>% mutate(across(any_of(variables),as.character))#factor可替换为ordered/as.character/as.numeric
连续型变量切分
#分位数切分
result1 <- result %>%mutate(across(any_of(cols_to_cut), ~cut(.,
breaks = c(min(.,na.rm=T)-1,quantile(., probs = c(0.25, 0.5, 0.75,1),na.rm = TRUE)),labels=F), .names = "{.col}class4"))
#等差切分
result3 <- result2 %>% mutate(across(any_of(cols_to_cut), ~cut(., breaks=c(min(.,na.rm=T)-1,min(.,na.rm=T)+diff(range(.,na.rm=T))/4,min(.,na.rm=T)+2*diff(range(.,na.rm=T))/4,min(.,na.rm=T)+3*diff(range(.,na.rm=T))/4,max(.,na.rm=T)),labels=F), .names = "{.col}classd4"))
#按给定的截断值切分
for (nm in predvars){
result2 <- result2 %>% mutate(across(
nm, ~cut(., breaks =c(min(.,na.rm=T)-1,thresholds[[match(nm, thresholds$predvar),1]],max(.,na.rm=T)),labels=F), .names = "{.col}class2"))}
批量计算列
#求日期间隔时间
for (i in 2:53){
diff_col_name <- paste("入院时间", i, sep = "") # 创建差值列的名称
result<- round((data1[, paste0("入院日期-", i)] - data1[,"入院日期-",i-1])/30,0) # 计算差值并存储
result <- as_tibble(result)
result <- set_names(result,diff_col_name)
dataresult<-bind_cols(dataresult,result)
}
#创建函数并用for循环选取变量实现批量计算
eGFR <- function(性别,年龄,血肌酐){
if (is.na(性别) || is.na(年龄) || is.na(血肌酐)) {
return(NA)}
else{
scr=血肌酐/88.4
if(性别==0&scr>0.9){eGFR=141*(scr/0.9)**-1.209*0.993**年龄}
else if(性别==0&scr<=0.9){eGFR=141*(scr/0.9)**-0.411*0.993**年龄}
else if(性别==1&scr>0.7){eGFR=144*(scr/0.7)**-1.209*0.993**年龄}
else if(性别==1&scr<=0.7){eGFR=144*(scr/0.7)**-0.329*0.993**年龄}
else{eGFR=NA}
return(eGFR)
}
}
for(i in 1:53){
diff_col_name <- paste("eGFR", i, sep = "") # 创建新列的名称
年龄 <- paste0("入院年龄",i)
血肌酐 <- paste0("首次随访-血肌酐Scr-",i)
result <- pmap_dbl(data1[,c("性别",年龄,血肌酐)], ~eGFR(..1,..2,..3))
result <- as_tibble(result)
result <- set_names(result,diff_col_name)
dataresult<-bind_cols(dataresult,result)
}
批量建模
#批量正态性检验
result %>% select(all_of(cols_to_cut)) %>% map(~shapiro.test(.))
variables <- c("weix7_1", "weix8_1", "weix9_1", "weix12_1", "weix13_1")
models <- list()
#批量回归建模
#1传统for循环
for (variable in variables) {
formula <- as.formula(paste("cl3bmi ~", variable, "+ ag117 + adoage + clnmeal + mac102 + mac301a + ado_exer + zhuado_pub + M_edu + F_edu + M_job + F_job + mother_age + ab106 + parity_c2 + pcaWI_C3 + treat_chris + ac203 + SGA_Inter_2021"))
model <- vglm(formula, family = multinomial(refLevel = "0"), data = result, cluster = ~ar06)
models[[variable]] <- model}
#2map循环
#2.1直接map
models <- map(variables, ~ vglm(formula = as.formula(paste("cl3bmi ~", ., "+ ag117 + adoage + clnmeal + mac102 + mac301a + ado_exer + zhuado_pub + M_edu + F_edu + M_job + F_job + mother_age + ab106 + parity_c2 + pcaWI_C3 + treat_chris + ac203 + SGA_Inter_2021")),family = multinomial(refLevel = "0"), data = result, cluster = ~ar06, na.omit = TRUE))
#2.2自定义函数map
models <- map(variables, function(x)
{formula <- as.formula(paste("cl3bmi ~", x, "+ ag117 + adoage + clnmeal + mac102 + mac301a + ado_exer + zhuado_pub + M_edu + F_edu + M_job + F_job + mother_age + ab106 + parity_c2 + pcaWI_C3 + treat_chris + ac203 + SGA_Inter_2021"))
vglm(formula, family = multinomial(refLevel = "0"), data = result, cluster = ~ar06, na.omit = TRUE)})
#批量建立roc模型
#二分类
for (i in 1:30) {
row_numbers <- as.numeric(rownames(prvalues[[i]]))
rocresult <- roc(result[row_numbers, ]$cl3bmi,prvalues[[i]])
rocresults[[i]] <- rocresult}
#三分类数据间隔建立roc(需将三分类观测值转化为两个二分类变量)
rocunderweight <- list()
rocobese <- list()
counter <- 1
for (nm in predvars) {
if (counter %% 2 == 1) {
roc_result <- roc(obesepredict$underweight, obesepredict[[nm]], ci = TRUE, auc = TRUE)
rocunderweight[nm] <- list(roc_result)}
counter <- counter + 1
}
for (nm in predvars) {
if (counter %% 2 == 0) {
roc_result <- roc(obesepredict$underweight, obesepredict[[nm]], ci = TRUE, auc = TRUE)
rocobese[nm] <- list(roc_result)}
counter <- counter + 1
}
模型处理
#批量求预测值
for (i in 1:30) {
prvalue <- predict(models[[i]], newdata = result, type = "response")
prvalues[[i]]<- prvalue}
#批量提取auc
aucresult1 <- tibble()
for(i in names(rocobese)){
result <- auc(rocobese[[i]])
idx <- match(i, names(rocobese)) #使用match函数将列名转换为索引
aucresult1[idx,"varname"] <- i
aucresult1[idx,"obeseauc"] <- as.numeric(result)}
#批量提取最佳截断值
thresholds <- tibble()
for (nm in predvars){
threshold <- coords(rocresult[[nm]], "best")
threshold$predvar <- nm
thresholds <- bind_rows(thresholds,threshold)}
结果信息处理并储存到嵌套列表
result_res <- list()for (i in names(result_dats)){
current_results <- list()
for(n in names(result_dats[[i]])){
if (nrow(result_dats[[i]][[n]])<3) { next }
res <- mr(result_dats[[i]][[n]])
res[,"dataid"] <- i
res[,"samplesize.exposure"] <- result_dats[[i]][[n]][1,"samplesize.exposure"] res[,"samplesize.outcome"] <- result_dats[[i]][[n]][1,"samplesize.outcome"]
res$lcl <- res$b - (1.96 * res$se)
res$ucl <- res$b + (1.96 * res$se)
res$or <- exp(res$b)
res$orlcl <- exp(res$lcl)
res$orucl <- exp(res$ucl)
current_results[[n]] <- res }
result_res[[i]] <- current_results }