R语言小贴士(2)–批量化

简介:利用tidyverse包中的across和map函数及传统的for循环功能,试图实现统计分析中大量重复工作的批量化以及繁杂代码的精简化,目前只汇总了目前所用到的一些批量化程序,进一步汇总有待后续更新。。。

设置变量类型

#批量设置因子型/有序型/字符型/数值型变量
variables <-c("Group.1.vs..2","Group.1.vs..3") 
variables <-colnames(adam[,1:14])  #两种设置方式都可
test <- d1 %>% mutate(across(any_of(variables),as.character))#factor可替换为ordered/as.character/as.numeric

连续型变量切分

 #分位数切分
result1 <- result %>%mutate(across(any_of(cols_to_cut), ~cut(., 
breaks = c(min(.,na.rm=T)-1,quantile(., probs = c(0.25, 0.5, 0.75,1),na.rm = TRUE)),labels=F), .names = "{.col}class4"))
 #等差切分
result3 <- result2 %>% mutate(across(any_of(cols_to_cut), ~cut(., breaks=c(min(.,na.rm=T)-1,min(.,na.rm=T)+diff(range(.,na.rm=T))/4,min(.,na.rm=T)+2*diff(range(.,na.rm=T))/4,min(.,na.rm=T)+3*diff(range(.,na.rm=T))/4,max(.,na.rm=T)),labels=F), .names = "{.col}classd4"))
#按给定的截断值切分
for (nm in predvars){ 
result2 <- result2 %>% mutate(across(
  nm, ~cut(., breaks =c(min(.,na.rm=T)-1,thresholds[[match(nm, thresholds$predvar),1]],max(.,na.rm=T)),labels=F), .names = "{.col}class2"))}

批量计算列

#求日期间隔时间
for (i in 2:53){
  diff_col_name <- paste("入院时间", i, sep = "") # 创建差值列的名称
  result<- round((data1[, paste0("入院日期-", i)] - data1[,"入院日期-",i-1])/30,0) # 计算差值并存储
  result <- as_tibble(result)
  result <- set_names(result,diff_col_name)
  dataresult<-bind_cols(dataresult,result)
}

#创建函数并用for循环选取变量实现批量计算
eGFR <- function(性别,年龄,血肌酐){
  if (is.na(性别) || is.na(年龄) || is.na(血肌酐)) {
    return(NA)}
  else{
  scr=血肌酐/88.4
  if(性别==0&scr>0.9){eGFR=141*(scr/0.9)**-1.209*0.993**年龄}
    else if(性别==0&scr<=0.9){eGFR=141*(scr/0.9)**-0.411*0.993**年龄}
      else if(性别==1&scr>0.7){eGFR=144*(scr/0.7)**-1.209*0.993**年龄}
       else if(性别==1&scr<=0.7){eGFR=144*(scr/0.7)**-0.329*0.993**年龄}
        else{eGFR=NA}
  return(eGFR)
 }
}
for(i in 1:53){
diff_col_name <- paste("eGFR", i, sep = "") # 创建新列的名称
年龄 <- paste0("入院年龄",i)
血肌酐 <- paste0("首次随访-血肌酐Scr-",i)
result <- pmap_dbl(data1[,c("性别",年龄,血肌酐)], ~eGFR(..1,..2,..3))
result <- as_tibble(result)
result <- set_names(result,diff_col_name)
dataresult<-bind_cols(dataresult,result)
}

批量建模

#批量正态性检验
result %>% select(all_of(cols_to_cut)) %>% map(~shapiro.test(.))
variables <- c("weix7_1", "weix8_1", "weix9_1", "weix12_1", "weix13_1")
models <- list()

#批量回归建模
#1传统for循环
for (variable in variables) {
  formula <- as.formula(paste("cl3bmi ~", variable, "+ ag117 + adoage + clnmeal + mac102 + mac301a + ado_exer + zhuado_pub + M_edu + F_edu + M_job + F_job + mother_age + ab106 + parity_c2 + pcaWI_C3 + treat_chris + ac203 + SGA_Inter_2021"))
  model <- vglm(formula, family = multinomial(refLevel = "0"), data = result, cluster = ~ar06)
  models[[variable]] <- model}
#2map循环
#2.1直接map
models <- map(variables, ~ vglm(formula = as.formula(paste("cl3bmi ~", ., "+ ag117 + adoage + clnmeal + mac102 + mac301a + ado_exer + zhuado_pub + M_edu + F_edu + M_job + F_job + mother_age + ab106 + parity_c2 + pcaWI_C3 + treat_chris + ac203 + SGA_Inter_2021")),family = multinomial(refLevel = "0"), data = result, cluster = ~ar06, na.omit = TRUE))
#2.2自定义函数map
models <- map(variables, function(x)
{formula <- as.formula(paste("cl3bmi ~", x, "+ ag117 + adoage + clnmeal + mac102 + mac301a + ado_exer + zhuado_pub + M_edu + F_edu + M_job + F_job + mother_age + ab106 + parity_c2 + pcaWI_C3 + treat_chris + ac203 + SGA_Inter_2021"))
vglm(formula, family = multinomial(refLevel = "0"), data = result, cluster = ~ar06, na.omit = TRUE)})

#批量建立roc模型
#二分类
for (i in 1:30) {
  row_numbers <- as.numeric(rownames(prvalues[[i]]))
  rocresult <- roc(result[row_numbers, ]$cl3bmi,prvalues[[i]])
  rocresults[[i]] <- rocresult}

#三分类数据间隔建立roc(需将三分类观测值转化为两个二分类变量)
rocunderweight <- list()
rocobese <- list()
counter <- 1
for (nm in predvars) {
  if (counter %% 2 == 1) {
  roc_result <- roc(obesepredict$underweight, obesepredict[[nm]], ci = TRUE, auc = TRUE)
  rocunderweight[nm] <- list(roc_result)}
  counter <- counter + 1
}
for (nm in predvars) {
  if (counter %% 2 == 0) {
  roc_result <- roc(obesepredict$underweight, obesepredict[[nm]], ci = TRUE, auc = TRUE)
  rocobese[nm] <- list(roc_result)}
  counter <- counter + 1
}

模型处理

#批量求预测值
for (i in 1:30) {
  prvalue <- predict(models[[i]], newdata = result, type = "response")
  prvalues[[i]]<- prvalue}

#批量提取auc
aucresult1 <- tibble()
for(i in names(rocobese)){
result <- auc(rocobese[[i]])
idx <- match(i, names(rocobese)) #使用match函数将列名转换为索引
aucresult1[idx,"varname"] <- i
aucresult1[idx,"obeseauc"] <- as.numeric(result)}

#批量提取最佳截断值
thresholds <- tibble()
for (nm in predvars){ 
threshold <- coords(rocresult[[nm]], "best")
threshold$predvar <- nm
thresholds <- bind_rows(thresholds,threshold)}

结果信息处理并储存到嵌套列表
result_res <- list()for (i in names(result_dats)){  
current_results <- list()  
  for(n in names(result_dats[[i]])){    
      if (nrow(result_dats[[i]][[n]])<3) {      next    }    
res <- mr(result_dats[[i]][[n]])    
res[,"dataid"] <- i    
res[,"samplesize.exposure"] <- result_dats[[i]][[n]][1,"samplesize.exposure"]    res[,"samplesize.outcome"] <- result_dats[[i]][[n]][1,"samplesize.outcome"]    
res$lcl <- res$b - (1.96 * res$se)    
res$ucl <- res$b + (1.96 * res$se)    
res$or <- exp(res$b)    
res$orlcl <- exp(res$lcl)    
res$orucl <- exp(res$ucl)    
current_results[[n]] <- res  }  
result_res[[i]] <- current_results }

暂无评论

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇
下一篇