### Logistic Graph Predicting Missing Values library(MASS) S <- matrix(0.5, 4, 4) diag(S) <- 1 n <- 100 dat <- mvrnorm(n, rep(0, 4), S) colnames(dat) <- c("x", "y1", "y2", "y3") dat[runif(n) < 0.2, "y1"] <- NA dat[runif(n) < 1/(1+exp(-(-1.7 + 0.7*dat[,"x"]))), "y2"] <- NA dat[runif(n) < 1/(1+exp(-(-1.7 + 0.7*dat[,"y3"]))), "y3"] <- NA plot(dat[,"x"], is.na(dat[,"y2"]), xlim=c(-4, 6), xlab="X", ylab="p(Missing)") lines(seq(-4, 6, 0.1),1/(1+exp(-(-1.7 + 0.7*seq(-4, 6, 0.1))))) ### Biases of Missing Data from MCAR, MAR, MNAR set.seed(123321) k <- 1000 neach <- 100 groupid <- rep(1:k, each=neach) n <- k*neach v0 <- rnorm(k, 0, 0.7) v0 <- rep(v0, each=neach) r0 <- rnorm(n, 0, 1) x <- 0.5 + v0 + r0 xc <- x - mean(x) # neach is equal across group so mean can be used directly u0 <- rnorm(k, 0, 7) u0 <- rep(u0, each=neach) e0 <- rnorm(n, 0, 7) y <- 50 + 5*x + u0 + e0 xs <- scale(x) ys <- scale(y) m1 <- 0.30 m2 <- 1 / (1 + exp(-(-1.015 + xs))) m3 <- 1 / (1 + exp(-(-1.015 + ys))) y1 <- y2 <- y3 <- y y1[runif(n) < m1] <- NA y2[runif(n) < m2] <- NA y3[runif(n) < m3] <- NA dat <- data.frame(groupid=groupid, y, y1, y2, y3, x) library(lme4) summary(lmer(y ~ 1 + (1|groupid), data=dat, REML=FALSE)) summary(lmer(y1 ~ 1 + (1|groupid), data=dat, REML=FALSE)) summary(lmer(y2 ~ 1 + (1|groupid), data=dat, REML=FALSE)) summary(lmer(y3 ~ 1 + (1|groupid), data=dat, REML=FALSE)) summary(lmer(y2 ~ 1 + I(x-mean(x)) + (1|groupid), data=dat, REML=FALSE)) summary(lmer(y ~ 1 + I(x-mean(x)) + (1|groupid), data=dat, REML=FALSE)) ###################################################### # library(mitml) # install.packages("devtools") # devtools::install_github(repo = "stefvanbuuren/mice") ### Example 1 dat1 <-read.table("lecture11ex1.csv", sep=",", header=TRUE, na.strings="999999") dat1a <- dat1[,c("erid", "score", "eesex", "ersex")] library(mice) md.pattern(dat1a) # Step 1 Transformation to Reduce Computation Burden m_score1a <- mean(dat1a$score, na.rm=TRUE) dat1a$score <- dat1a$score - m_score1a # Step 2 Missing Data Prediction Model pred1a <- make.predictorMatrix(dat1a) pred1a meth1a <- make.method(dat1a) meth1a pred1a[,] <- 0 pred1a["score",] <- c(-2, 0, 4, 1) meth1a["score"] <- "2l.pan" # Step 3 Imputation imp <- mice(dat1a, pred=pred1a, meth=meth1a, m=10, maxit=10, seed=123321) # Step 4 Check Convergence plot(imp, c("score")) imp <- mice(dat1a, pred=pred1a, meth=meth1a, m=30, maxit=20, seed=123321) plot(imp, c("score")) # Step 5 Transform back to original scale impdat1a <- complete(imp, "long", include = TRUE) impdat1a$score <- impdat1a$score + m_score1a imp1a <- as.mids(impdat1a) # Step 6 Analyze each imputed data library(lme4) fit1 <- with(imp1a, lmer(score ~ eesex + ersex + (1|erid), REML=FALSE)) fit2 <- with(imp1a, lmer(score ~ eesex + ersex + (1 + eesex|erid), REML=FALSE)) # Step 7 Pool analysis results out1 <- pool(fit1) summary(out1) out2 <- pool(fit2) summary(out2) source("mimlmtools.R") ranefMI(fit1) ranefMI(fit2) sigmaMI(fit1) sigmaMI(fit2) anovaMI(fit1, fit2) ########################################## ### Example 2 dat1b <- dat1[,c("erid", "score", "eesalary", "eeworkexp", "erexp")] library(mice) library(miceadds) md.pattern(dat1b) # Step 1 Transformation m_score1b <- mean(dat1b$score, na.rm=TRUE) m_eesalary1b <- mean(dat1b$eesalary, na.rm=TRUE) m_eeworkexp1b <- mean(dat1b$eeworkexp, na.rm=TRUE) dat1bg <- dat1b[!duplicated(dat1b$erid),] m_erexp1b <- mean(dat1bg$erexp, na.rm=TRUE) sd_score1b <- sd(dat1b$score, na.rm=TRUE) sd_eesalary1b <- sd(dat1b$eesalary, na.rm=TRUE) sd_eeworkexp1b <- sd(dat1b$eeworkexp, na.rm=TRUE) sd_erexp1b <- sd(dat1bg$erexp, na.rm=TRUE) dat1b$score <- (dat1b$score - m_score1b) / sd_score1b dat1b$eesalary <- (dat1b$eesalary - m_eesalary1b) / sd_eesalary1b dat1b$eeworkexp <- (dat1b$eeworkexp - m_eeworkexp1b) / sd_eeworkexp1b dat1b$erexp <- (dat1b$erexp - m_erexp1b) / sd_erexp1b # Step 2 Group mean centering dat1b$avescore <- NA dat1b$aveeesalary <- NA dat1b$aveeeworkexp <- NA dat1b$diffscore <- NA dat1b$diffeesalary <- NA dat1b$diffeeworkexp <- NA # Step 3 Interaction dat1b$int1 <- NA # diffeeworkexp * erexp dat1b$int2 <- NA # aveeeworkexp * erexp # Step 4 Make Prediction Model meth1b <- make.method(dat1b) meth1b pred1b <- make.predictorMatrix(dat1b) pred1b meth1b[c("score", "eesalary", "eeworkexp")] <- "2l.pan" meth1b["erexp"] <- "2lonly.norm" meth1b[c("avescore", "aveeesalary", "aveeeworkexp")] <- "2l.groupmean" meth1b["diffscore"] <- "~I(score - avescore)" meth1b["diffeesalary"] <- "~I(eesalary - aveeesalary)" meth1b["diffeeworkexp"] <- "~I(eeworkexp - aveeeworkexp)" meth1b["int1"] <- "~I(diffeeworkexp*erexp)" meth1b["int2"] <- "~I(aveeeworkexp*erexp)" pred1b[,] <- 0 pred1b[c("score", "eesalary", "eeworkexp", "erexp"), "erid"] <- -2 pred1b[c("avescore", "aveeesalary", "aveeeworkexp"), "erid"] <- -2 pred1b["score", c("aveeesalary", "aveeeworkexp", "erexp")] <- 1 pred1b["score", c("diffeesalary", "diffeeworkexp")] <- 2 pred1b["score", c("int1", "int2")] <- 1 pred1b["eesalary", c("avescore", "aveeeworkexp", "erexp")] <- 1 pred1b["eesalary", c("diffscore", "diffeeworkexp")] <- 2 pred1b["eesalary", c("int1", "int2")] <- 1 pred1b["eeworkexp", c("avescore", "aveeesalary", "erexp")] <- 1 pred1b["eeworkexp", c("diffscore", "diffeesalary")] <- 2 pred1b["erexp", c("avescore", "aveeesalary", "aveeeworkexp")] <- 1 pred1b["avescore", "score"] <- 1 pred1b["aveeesalary", "eesalary"] <- 2 pred1b["aveeeworkexp", "eeworkexp"] <- 2 # Step 5 Determine the sequence of imputation visit1b <- c("score", "avescore", "diffscore", "eesalary", "aveeesalary", "diffeesalary", "erexp", "int1", "int2", "eeworkexp", "aveeeworkexp", "diffeeworkexp", "int1", "int2") # Step 6 Impute data imp1b <- mice(dat1b, pred=pred1b, meth=meth1b, m=10, maxit=10, visit=visit1b, seed=123321, allow.na = TRUE) # Step 7 Check convergence plot(imp1b, c("score", "eesalary")) plot(imp1b, c("eeworkexp", "erexp")) imp1b <- mice(dat1b, pred=pred1b, meth=meth1b, m=30, maxit=20, visit=visit1b, seed=123321, allow.na = TRUE) plot(imp1b, c("score", "eesalary")) plot(imp1b, c("eeworkexp", "erexp")) # Step 8 Transform back to original scale impdat1b <- complete(imp1b, "long", include = TRUE) impdat1b$score <- sd_score1b*impdat1b$score + m_score1b impdat1b$eesalary <- sd_eesalary1b*impdat1b$eesalary + m_eesalary1b impdat1b$eeworkexp <- sd_eeworkexp1b*impdat1b$eeworkexp + m_eeworkexp1b impdat1b$erexp <- sd_erexp1b*impdat1b$erexp + m_erexp1b imp1b2 <- as.mids(impdat1b) # Step 9 Analyze each data library(lme4) fit1b1 <- with(imp1b2, lmer(score ~ eesalary + eeworkexp + erexp + (1|erid), REML=FALSE)) fit1b2 <- with(imp1b2, lmer(score ~ eesalary + eeworkexp + erexp + eeworkexp*erexp+ (1|erid), REML=FALSE)) # Step 10 Pool each analysis result out1b1 <- pool(fit1b1) summary(out1b1) out1b2 <- pool(fit1b2) summary(out1b2) source("mimlmtools.R") ranefMI(fit1b1) ranefMI(fit1b2) sigmaMI(fit1b1) sigmaMI(fit1b2) anovaMI(fit1b1, fit1b2) ######################################################### ### Example 3 dat2 <-read.table("lecture11ex2.csv", sep=",", header=TRUE, na.strings="999999") library(mice) library(miceadds) md.pattern(dat2) # Step 1 Rescale data m_efficacy2 <- mean(dat2$efficacy, na.rm=TRUE) m_ach2 <- mean(dat2$ach, na.rm=TRUE) sd_efficacy2 <- sd(dat2$efficacy, na.rm=TRUE) sd_ach2 <- sd(dat2$ach, na.rm=TRUE) dat2$efficacy <- (dat2$efficacy - m_efficacy2) / sd_efficacy2 dat2$ach <- (dat2$ach - m_ach2) / sd_ach2 # Step 2 Prepare for group-mean centered, deviation, interaction dat2$aveach <- NA dat2$aveefficacy <- NA dat2$diffach <- NA dat2$diffefficacy <- NA dat2$intefficacy <- NA # Step 3 Make imputation model meth2 <- make.method(dat2) meth2 pred2 <- make.predictorMatrix(dat2) pred2 meth2[c("efficacy", "ach")] <- "2l.pan" meth2[c("aveefficacy", "aveach")] <- "2l.groupmean" meth2["diffach"] <- "~I(ach - aveach)" meth2["diffefficacy"] <- "~I(efficacy - aveefficacy)" meth2["intefficacy"] <- "~I(aveefficacy*diffefficacy)" pred2[,] <- 0 pred2[c("ach", "efficacy", "aveach", "aveefficacy"), "schoolid"] <- -2 pred2["efficacy", c("aveach", "diffach", "private")] <- c(1, 2, 1) pred2["ach", c("aveefficacy", "diffefficacy", "intefficacy", "private")] <- c(1, 2, 1, 1) pred2["aveach", "ach"] <- 2 pred2["aveefficacy", "efficacy"] <- 2 # Step 4 Determine the sequence of imputation visit2 <- c("efficacy", "aveefficacy", "diffefficacy", "intefficacy", "ach", "aveach", "diffach") # Step 5 Impute data imp2 <- mice(dat2, pred=pred2, meth=meth2, m=30, maxit=20, visit=visit2, seed=123321, allow.na = TRUE) # Step 6 Check convergence plot(imp2, c("ach", "efficacy")) # Step 7 Transform data back to original scale impdat2 <- complete(imp2, "long", include = TRUE) impdat2$efficacy <- sd_efficacy2*impdat2$efficacy + m_efficacy2 impdat2$ach <- sd_ach2*impdat2$ach + m_ach2 imp22 <- as.mids(impdat2) # Step 8 Analyze imputed data library(lme4) fit21 <- with(imp22, lmer(ach ~ I(efficacy - ave(efficacy, schoolid)) + I(ave(efficacy, schoolid)) + private + (1 + I(efficacy - ave(efficacy, schoolid))|schoolid), REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead"))) fit22 <- with(imp22, lmer(ach ~ I(efficacy - ave(efficacy, schoolid))*I(ave(efficacy, schoolid)) + private + (1 + I(efficacy - ave(efficacy, schoolid))|schoolid), REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead"))) # STep 9 Pool the results out21 <- pool(fit21) summary(out21) out22 <- pool(fit22) summary(out22) source("mimlmtools.R") ranefMI(fit21) ranefMI(fit22) sigmaMI(fit21) sigmaMI(fit22) anovaMI(fit21, fit22) ########################################################################### ### Example 4 library(mice) library(miceadds) dat3 <-read.table("lecture11ex3.csv", sep=",", header=TRUE, na.strings="999999") md.pattern(dat3) aggregate(stress ~ time, data = dat3, FUN = function(x) sum(is.na(x)), na.action = NULL) # Step 1 Change the scales of data m_mem3 <- mean(dat3$mem, na.rm=TRUE) m_stress3 <- mean(dat3$stress, na.rm=TRUE) sd_mem3 <- sd(dat3$mem, na.rm=TRUE) sd_stress3 <- sd(dat3$stress, na.rm=TRUE) dat3$mem <- (dat3$mem - m_mem3) / sd_mem3 dat3$stress <- (dat3$stress - m_stress3) / sd_stress3 dat3g <- dat3[dat3$time == 1,] m_act3 <- mean(dat3g$act, na.rm=TRUE) m_ext3 <- mean(dat3g$ext, na.rm=TRUE) sd_act3 <- sd(dat3g$act, na.rm=TRUE) sd_ext3 <- sd(dat3g$ext, na.rm=TRUE) dat3$act <- (dat3$act - m_act3) / sd_act3 dat3$ext <- (dat3$ext - m_ext3) / sd_ext3 # Step 2 Center time variable dat3$time <- dat3$time - 1 # Step 3 Group means, Deviation, Interactions dat3$intact <- dat3$time * dat3$act dat3$intext <- dat3$time * dat3$ext dat3$intcohort <- dat3$time * dat3$cohort dat3$intfemale <- dat3$time * dat3$female dat3$avemem <- NA dat3$diffmem <- NA dat3$avestress <- NA dat3$diffstress <- NA dat3$intmem1 <- NA dat3$intmem2 <- NA dat3$intstress1 <- NA dat3$intstress2 <- NA # Step 4 Make imputation model pred3 <- make.predictorMatrix(dat3) pred3 meth3 <- make.method(dat3) meth3 pred3[,] <- 0 pred3["mem", c("pid", "time", "cohort", "female", "act", "ext", "avestress", "diffstress", "intstress1", "intstress2", "intact", "intext", "intcohort", "intfemale")] <- c(-2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) pred3["stress", c("pid", "time", "cohort", "female", "act", "ext", "avemem", "diffmem", "intmem1", "intmem2", "intact", "intext", "intcohort", "intfemale")] <- c(-2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) pred3["avemem", c("pid", "mem")] <- c(-2, 2) pred3["avestress", c("pid", "stress")] <- c(-2, 2) meth3[c("mem", "stress")] <- "2l.pan" meth3[c("avemem", "avestress")] <- "2l.groupmean" meth3["diffmem"] <- "~I(mem - avemem)" meth3["diffstress"] <- "~I(stress - avestress)" meth3["intmem1"] <- "~I(time*avemem)" meth3["intmem2"] <- "~I(time*diffmem)" meth3["intstress1"] <- "~I(time*avestress)" meth3["intstress2"] <- "~I(time*diffstress)" # Step 5 Sequence of imputation visit3 <- c("mem", "avemem", "diffmem", "intmem1", "intmem2", "stress", "avestress", "diffstress", "intstress1", "intstress2") # Step 6 Impute data imp3 <- mice(dat3, pred=pred3, meth=meth3, m=30, maxit=20, visit=visit3, seed=123321, allow.na = TRUE) # Step 7 Check convergence plot(imp3, c("mem", "stress")) # Step 8 Transform data back to original scale impdat3 <- complete(imp3, "long", include = TRUE) impdat3$mem <- sd_mem3*impdat3$mem + m_mem3 impdat3$stress <- sd_stress3*impdat3$stress + m_stress3 impdat3$act <- sd_act3*impdat3$act + m_act3 impdat3$ext <- sd_ext3*impdat3$ext + m_ext3 imp32 <- as.mids(impdat3) # Step 9 Analyze each data library(lme4) fit31 <- with(imp32, lmer(mem ~ time + (1|pid), REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead"))) fit32 <- with(imp32, lmer(mem ~ time + (1 + time|pid), REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead"))) # Step 10 Pool the results out31 <- pool(fit31) summary(out31) out32 <- pool(fit32) summary(out32) source("mimlmtools.R") ranefMI(fit31) ranefMI(fit32) sigmaMI(fit31) sigmaMI(fit32) anovaMI(fit31, fit32) ########################################################################################### ### Example 4.2 Impute in wide format dat3 <-read.table("lecture10ex3.csv", sep=",", header=TRUE, na.strings="999999") dat3 <- dat3[,-1] psych::describe(dat3) # Change data to wide format dat3w <- reshape(data=dat3, idvar="pid", timevar="time", v.names=c("mem", "stress"), direction="wide") library(mice) md.pattern(dat3w) # Step 1-3 Not needed # Step 4 Make imputation model pred3w <- make.predictorMatrix(dat3w) pred3w meth3w <- make.method(dat3w) meth3w pred3w[,] <- 0 pred3w[c("mem.2", "mem.3", "mem.4", "stress.2", "stress.3", "stress.4"), ] <- 1 pred3w[, "pid"] <- 0 diag(pred3w) <- 0 meth3w[c("mem.2", "mem.3", "mem.4", "stress.2", "stress.3", "stress.4")] <- "pmm" # Step 5 Not needed # Step 6 Impute data imp3w <- mice(dat3w, pred=pred3w, meth=meth3w, m=30, maxit=20, seed=123321) # Step 7 Check convergence plot(imp3w, c("mem.2", "mem.3")) plot(imp3w, c("mem.4", "stress.2")) plot(imp3w, c("stress.3", "stress.4")) # Step 8 Not needed # However, need to change data back to long format impdat3w <- complete(imp3w, "long", include = TRUE) mem <- c("mem.1", "mem.2", "mem.3", "mem.4") stress <- c("stress.1", "stress.2", "stress.3", "stress.4") timevarying <- list(mem, stress) impdat3w$id2 <- 1:nrow(impdat3w) dat3l <- reshape(data=impdat3w, idvar="id2", times=0:3, timevar="time", varying = timevarying, v.names=c("mem", "stress"), direction="long") dat3l$.id <- ave(dat3l$id2, dat3l$.imp, FUN=seq_along) imp3w2 <- as.mids(dat3l) # Step 9 Analyze each data library(lme4) fit3w1 <- with(imp3w2, lmer(mem ~ time + (1|pid), REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead"))) fit3w2 <- with(imp3w2, lmer(mem ~ time + (1 + time|pid), REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead"))) # Step 10 Pool the results out3w1 <- pool(fit3w1) summary(out3w1) out3w2 <- pool(fit3w2) summary(out3w2) source("mimlmtools.R") ranefMI(fit3w1) ranefMI(fit3w2) sigmaMI(fit3w1) sigmaMI(fit3w2) anovaMI(fit3w1, fit3w2)