注:本章内容介绍了excel中常用的查询填补法,该方法相较于Excel中的功能更为强大,自定义程度高,可规定按照查询到的值替换原数据或仅替换原数据的缺失值。
数据集比较
#修改数据格式使对应
bcdata <- bcdata %>% mutate(across(all_of(variables),~as.numeric(as.character(.))))
str(bcdata)
str(df_filled)
#比较两数据集的差异
#方法1简单比较-按列名匹配(适用于行顺序对应的两数据集)
result <- bcdata %>%
mutate(across(everything(), ~ifelse(. != df_filled2[[cur_column()]]|is.na(.) != is.na(df_filled2[[cur_column()]]), ., NA)))
# 输出补充数据集和原数据集不同的值
result %>% filter(!is.na(idcard))
#方法2精确比较-按同一id行和列名匹配(两数据集行位置不对应,先根据标识id变量匹配)
result <- bcdata %>%
left_join(df_filled2, by = "id", suffix = c("_a", "_b")) %>%
mutate(across(ends_with("_a"), ~ifelse(. != get(sub("a$", "b", cur_column()))|is.na(.) != is.na(get(sub("a$", "b", cur_column()))), ., NA)))
# 输出补充数据集和原数据集不同的值
result %>% filter(!is.na(idcard_a))
# 查看通过标识符如name、id可以匹配多少观测
c1 <- df_filled %>% select("name","idcard") %>% mutate(match=1)
c2 <- bcdata %>% select("name","idcard")
c3 <- c2 %>% left_join(c1,by=c("name","idcard"))
c3 %>% filter(is.na(match)) #查看未匹配到的观测,检查是否有数据问题
查询填补
#1.简单插补--插补变量在原数据集不存在
filled_data <- df_filled %>%
left_join(bcdata[,c("idcard", "name","variable")], by = c("idcard", "name"))
#2.复杂填补--插补变量在原数据集存在,仅需插补原数据集中的缺失值
# 获取需要填补的变量名
variables_to_fill <- setdiff(names(bcdata), c("name","idcard","filename")) #获取填补变量名
filled_data <- df_filled %>%
left_join(bcdata, by = c("idcard", "name"), suffix = c("", ".ref")) %>%
mutate(across(
all_of(variables_to_fill),
~ coalesce(., get(paste0(cur_column(),".ref"))))) %>%
select(-ends_with(".ref"))
数据更新
#数据更新--插补变量在原数据集存在,需要用新数据集中对应变量的非缺失值更新原数据集
variables_to_fill <- setdiff(intersect(names(new_data), names(origin_data)), c("id","name","idcard","pc","j1"))
test <- origin_data %>%
left_join(new_data[c("id",variables_to_fill)], by = "id", suffix = c("", ".ref")) %>%
mutate(across(
all_of(variables_to_fill),
~ ifelse(
!is.na(get(paste0(cur_column(), ".ref"))) & str_trim(get(paste0(cur_column(), ".ref"))) != "",
get(paste0(cur_column(), ".ref")),.))) %>%
select(-ends_with(".ref"))