### Logistic Graph Predicting Missing Values

library(MASS)
S <- matrix(0.5, 4, 4)
diag(S) <- 1
n <- 100
dat <- mvrnorm(n, rep(0, 4), S)
colnames(dat) <- c("x", "y1", "y2", "y3")

dat[runif(n) < 0.2, "y1"] <- NA
dat[runif(n) < 1/(1+exp(-(-1.7 + 0.7*dat[,"x"]))), "y2"] <- NA
dat[runif(n) < 1/(1+exp(-(-1.7 + 0.7*dat[,"y3"]))), "y3"] <- NA

plot(dat[,"x"], is.na(dat[,"y2"]), xlim=c(-4, 6), xlab="X", ylab="p(Missing)")
lines(seq(-4, 6, 0.1),1/(1+exp(-(-1.7 + 0.7*seq(-4, 6, 0.1)))))

### Biases of Missing Data from MCAR, MAR, MNAR

set.seed(123321)
k <- 1000
neach <- 100
groupid <- rep(1:k, each=neach)
n <- k*neach
v0 <- rnorm(k, 0, 0.7)
v0 <- rep(v0, each=neach)
r0 <- rnorm(n, 0, 1)
x <- 0.5 + v0 + r0
xc <- x - mean(x) # neach is equal across group so mean can be used directly
u0 <- rnorm(k, 0, 7)
u0 <- rep(u0, each=neach)
e0 <- rnorm(n, 0, 7)
y <- 50 + 5*x + u0 + e0

xs <- scale(x)
ys <- scale(y)
m1 <- 0.30
m2 <- 1 / (1 + exp(-(-1.015 + xs)))
m3 <- 1 / (1 + exp(-(-1.015 + ys)))
y1 <- y2 <- y3 <- y
y1[runif(n) < m1] <- NA
y2[runif(n) < m2] <- NA
y3[runif(n) < m3] <- NA

dat <- data.frame(groupid=groupid, y, y1, y2, y3, x)
library(lme4)
summary(lmer(y ~ 1 + (1|groupid), data=dat, REML=FALSE))
summary(lmer(y1 ~ 1 + (1|groupid), data=dat, REML=FALSE))
summary(lmer(y2 ~ 1 + (1|groupid), data=dat, REML=FALSE))
summary(lmer(y3 ~ 1 + (1|groupid), data=dat, REML=FALSE))
summary(lmer(y2 ~ 1 + I(x-mean(x)) + (1|groupid), data=dat, REML=FALSE))
summary(lmer(y ~ 1 + I(x-mean(x)) + (1|groupid), data=dat, REML=FALSE))


######################################################
# library(mitml)
# install.packages("devtools")
# devtools::install_github(repo = "stefvanbuuren/mice")


### Example 1

dat1 <-read.table("lecture11ex1.csv", sep=",", header=TRUE, na.strings="999999")
dat1a <- dat1[,c("erid", "score", "eesex", "ersex")]
library(mice)
md.pattern(dat1a)

# Step 1 Transformation to Reduce Computation Burden

m_score1a <- mean(dat1a$score, na.rm=TRUE)
dat1a$score <- dat1a$score - m_score1a

# Step 2 Missing Data Prediction Model

pred1a <- make.predictorMatrix(dat1a)
pred1a
meth1a <- make.method(dat1a)
meth1a

pred1a[,] <- 0
pred1a["score",] <- c(-2, 0, 4, 1)
meth1a["score"] <- "2l.pan"

# Step 3 Imputation

imp <- mice(dat1a, pred=pred1a, meth=meth1a, m=10, maxit=10, seed=123321)

# Step 4 Check Convergence

plot(imp, c("score"))

imp <- mice(dat1a, pred=pred1a, meth=meth1a, m=30, maxit=20, seed=123321)
plot(imp, c("score"))

# Step 5 Transform back to original scale

impdat1a <- complete(imp, "long", include = TRUE)
impdat1a$score <- impdat1a$score + m_score1a
imp1a <- as.mids(impdat1a)

# Step 6 Analyze each imputed data

library(lme4)
fit1 <- with(imp1a, lmer(score ~ eesex + ersex + (1|erid), REML=FALSE))
fit2 <- with(imp1a, lmer(score ~ eesex + ersex + (1 + eesex|erid), REML=FALSE))

# Step 7 Pool analysis results

out1 <- pool(fit1)
summary(out1)

out2 <- pool(fit2)
summary(out2)

source("mimlmtools.R")
ranefMI(fit1)
ranefMI(fit2)

sigmaMI(fit1)
sigmaMI(fit2)

anovaMI(fit1, fit2)


##########################################

### Example 2

dat1b <- dat1[,c("erid", "score", "eesalary", "eeworkexp", "erexp")]
library(mice)
library(miceadds)
md.pattern(dat1b)

# Step 1 Transformation

m_score1b <- mean(dat1b$score, na.rm=TRUE)
m_eesalary1b <- mean(dat1b$eesalary, na.rm=TRUE)
m_eeworkexp1b <- mean(dat1b$eeworkexp, na.rm=TRUE)
dat1bg <- dat1b[!duplicated(dat1b$erid),]
m_erexp1b <- mean(dat1bg$erexp, na.rm=TRUE)

sd_score1b <- sd(dat1b$score, na.rm=TRUE)
sd_eesalary1b <- sd(dat1b$eesalary, na.rm=TRUE)
sd_eeworkexp1b <- sd(dat1b$eeworkexp, na.rm=TRUE)
sd_erexp1b <- sd(dat1bg$erexp, na.rm=TRUE)

dat1b$score <- (dat1b$score - m_score1b) / sd_score1b
dat1b$eesalary <- (dat1b$eesalary - m_eesalary1b) / sd_eesalary1b
dat1b$eeworkexp <- (dat1b$eeworkexp - m_eeworkexp1b) / sd_eeworkexp1b
dat1b$erexp <- (dat1b$erexp - m_erexp1b) / sd_erexp1b

# Step 2 Group mean centering

dat1b$avescore <- NA
dat1b$aveeesalary <- NA
dat1b$aveeeworkexp <- NA

dat1b$diffscore <- NA
dat1b$diffeesalary <- NA
dat1b$diffeeworkexp <- NA


# Step 3 Interaction

dat1b$int1 <- NA # diffeeworkexp * erexp
dat1b$int2 <- NA # aveeeworkexp * erexp

# Step 4 Make Prediction Model

meth1b <- make.method(dat1b)
meth1b
pred1b <- make.predictorMatrix(dat1b)
pred1b

meth1b[c("score", "eesalary", "eeworkexp")] <- "2l.pan"
meth1b["erexp"] <- "2lonly.norm"
meth1b[c("avescore", "aveeesalary", "aveeeworkexp")] <- "2l.groupmean"
meth1b["diffscore"] <- "~I(score - avescore)"
meth1b["diffeesalary"] <- "~I(eesalary - aveeesalary)"
meth1b["diffeeworkexp"] <- "~I(eeworkexp - aveeeworkexp)"
meth1b["int1"] <- "~I(diffeeworkexp*erexp)"
meth1b["int2"] <- "~I(aveeeworkexp*erexp)"

pred1b[,] <- 0
pred1b[c("score", "eesalary", "eeworkexp", "erexp"), "erid"] <- -2
pred1b[c("avescore", "aveeesalary", "aveeeworkexp"), "erid"] <- -2
pred1b["score", c("aveeesalary", "aveeeworkexp", "erexp")] <- 1
pred1b["score", c("diffeesalary", "diffeeworkexp")] <- 2
pred1b["score", c("int1", "int2")] <- 1
pred1b["eesalary", c("avescore", "aveeeworkexp", "erexp")] <- 1
pred1b["eesalary", c("diffscore", "diffeeworkexp")] <- 2
pred1b["eesalary", c("int1", "int2")] <- 1
pred1b["eeworkexp", c("avescore", "aveeesalary", "erexp")] <- 1
pred1b["eeworkexp", c("diffscore", "diffeesalary")] <- 2
pred1b["erexp", c("avescore", "aveeesalary", "aveeeworkexp")] <- 1
pred1b["avescore", "score"] <- 1
pred1b["aveeesalary", "eesalary"] <- 2
pred1b["aveeeworkexp", "eeworkexp"] <- 2

# Step 5 Determine the sequence of imputation

visit1b <- c("score", "avescore", "diffscore", 
             "eesalary", "aveeesalary", "diffeesalary", 
             "erexp", "int1", "int2",
             "eeworkexp", "aveeeworkexp", "diffeeworkexp",
             "int1", "int2")

# Step 6 Impute data

imp1b <- mice(dat1b, pred=pred1b, meth=meth1b, m=10, maxit=10, 
              visit=visit1b, seed=123321, allow.na = TRUE)

# Step 7 Check convergence

plot(imp1b, c("score", "eesalary"))
plot(imp1b, c("eeworkexp", "erexp"))

imp1b <- mice(dat1b, pred=pred1b, meth=meth1b, m=30, maxit=20, 
              visit=visit1b, seed=123321, allow.na = TRUE)

plot(imp1b, c("score", "eesalary"))
plot(imp1b, c("eeworkexp", "erexp"))

# Step 8 Transform back to original scale

impdat1b <- complete(imp1b, "long", include = TRUE)
impdat1b$score <- sd_score1b*impdat1b$score + m_score1b
impdat1b$eesalary <- sd_eesalary1b*impdat1b$eesalary + m_eesalary1b
impdat1b$eeworkexp <- sd_eeworkexp1b*impdat1b$eeworkexp + m_eeworkexp1b
impdat1b$erexp <- sd_erexp1b*impdat1b$erexp + m_erexp1b
imp1b2 <- as.mids(impdat1b)

# Step 9 Analyze each data

library(lme4)
fit1b1 <- with(imp1b2, lmer(score ~ eesalary + eeworkexp 
                            + erexp + (1|erid), REML=FALSE))
fit1b2 <- with(imp1b2, lmer(score ~ eesalary + eeworkexp 
                            + erexp + eeworkexp*erexp+ (1|erid), REML=FALSE))

# Step 10 Pool each analysis result

out1b1 <- pool(fit1b1)
summary(out1b1)
out1b2 <- pool(fit1b2)
summary(out1b2)

source("mimlmtools.R")
ranefMI(fit1b1)
ranefMI(fit1b2)
sigmaMI(fit1b1)
sigmaMI(fit1b2)

anovaMI(fit1b1, fit1b2)

#########################################################

### Example 3

dat2 <-read.table("lecture11ex2.csv", sep=",", header=TRUE, na.strings="999999")
library(mice)
library(miceadds)
md.pattern(dat2)

# Step 1 Rescale data

m_efficacy2 <- mean(dat2$efficacy, na.rm=TRUE)
m_ach2 <- mean(dat2$ach, na.rm=TRUE)
sd_efficacy2 <- sd(dat2$efficacy, na.rm=TRUE)
sd_ach2 <- sd(dat2$ach, na.rm=TRUE)
dat2$efficacy <- (dat2$efficacy - m_efficacy2) / sd_efficacy2
dat2$ach <- (dat2$ach - m_ach2) / sd_ach2

# Step 2 Prepare for group-mean centered, deviation, interaction

dat2$aveach <- NA
dat2$aveefficacy <- NA
dat2$diffach <- NA
dat2$diffefficacy <- NA
dat2$intefficacy <- NA

# Step 3 Make imputation model

meth2 <- make.method(dat2)
meth2
pred2 <- make.predictorMatrix(dat2)
pred2

meth2[c("efficacy", "ach")] <- "2l.pan"
meth2[c("aveefficacy", "aveach")] <- "2l.groupmean"
meth2["diffach"] <- "~I(ach - aveach)"
meth2["diffefficacy"] <- "~I(efficacy - aveefficacy)"
meth2["intefficacy"] <- "~I(aveefficacy*diffefficacy)"

pred2[,] <- 0
pred2[c("ach", "efficacy", "aveach", "aveefficacy"), "schoolid"] <- -2
pred2["efficacy", c("aveach", "diffach", "private")] <- c(1, 2, 1)
pred2["ach", c("aveefficacy", "diffefficacy", "intefficacy", "private")] <- c(1, 2, 1, 1)
pred2["aveach", "ach"] <- 2
pred2["aveefficacy", "efficacy"] <- 2

# Step 4 Determine the sequence of imputation

visit2 <- c("efficacy", "aveefficacy", "diffefficacy", "intefficacy", 
            "ach", "aveach", "diffach")

# Step 5 Impute data

imp2 <- mice(dat2, pred=pred2, meth=meth2, m=30, maxit=20, 
             visit=visit2, seed=123321, allow.na = TRUE)

# Step 6 Check convergence

plot(imp2, c("ach", "efficacy"))

# Step 7 Transform data back to original scale

impdat2 <- complete(imp2, "long", include = TRUE)
impdat2$efficacy <- sd_efficacy2*impdat2$efficacy + m_efficacy2
impdat2$ach <- sd_ach2*impdat2$ach + m_ach2
imp22 <- as.mids(impdat2)

# Step 8 Analyze imputed data

library(lme4)
fit21 <- with(imp22, 
              lmer(ach ~ I(efficacy - ave(efficacy, schoolid)) + I(ave(efficacy, schoolid)) 
                   + private + (1 + I(efficacy - ave(efficacy, schoolid))|schoolid), 
                   REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead")))
fit22 <- with(imp22, 
              lmer(ach ~ I(efficacy - ave(efficacy, schoolid))*I(ave(efficacy, schoolid)) 
                   + private + (1 + I(efficacy - ave(efficacy, schoolid))|schoolid), 
                   REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead")))

# STep 9 Pool the results

out21 <- pool(fit21)
summary(out21)
out22 <- pool(fit22)
summary(out22)

source("mimlmtools.R")
ranefMI(fit21)
ranefMI(fit22)
sigmaMI(fit21)
sigmaMI(fit22)

anovaMI(fit21, fit22)

###########################################################################

### Example 4

library(mice)
library(miceadds)

dat3 <-read.table("lecture11ex3.csv", sep=",", header=TRUE, na.strings="999999")
md.pattern(dat3)

aggregate(stress ~ time, data = dat3, FUN = function(x) sum(is.na(x)), na.action = NULL)

# Step 1 Change the scales of data

m_mem3 <- mean(dat3$mem, na.rm=TRUE)
m_stress3 <- mean(dat3$stress, na.rm=TRUE)
sd_mem3 <- sd(dat3$mem, na.rm=TRUE)
sd_stress3 <- sd(dat3$stress, na.rm=TRUE)
dat3$mem <- (dat3$mem - m_mem3) / sd_mem3
dat3$stress <- (dat3$stress - m_stress3) / sd_stress3

dat3g <- dat3[dat3$time == 1,]
m_act3 <- mean(dat3g$act, na.rm=TRUE)
m_ext3 <- mean(dat3g$ext, na.rm=TRUE)
sd_act3 <- sd(dat3g$act, na.rm=TRUE)
sd_ext3 <- sd(dat3g$ext, na.rm=TRUE)
dat3$act <- (dat3$act - m_act3) / sd_act3
dat3$ext <- (dat3$ext - m_ext3) / sd_ext3

# Step 2 Center time variable

dat3$time <- dat3$time - 1

# Step 3 Group means, Deviation, Interactions

dat3$intact <- dat3$time * dat3$act
dat3$intext <- dat3$time * dat3$ext
dat3$intcohort <- dat3$time * dat3$cohort
dat3$intfemale <- dat3$time * dat3$female

dat3$avemem <- NA
dat3$diffmem <- NA
dat3$avestress <- NA
dat3$diffstress <- NA
dat3$intmem1 <- NA
dat3$intmem2 <- NA
dat3$intstress1 <- NA
dat3$intstress2 <- NA

# Step 4 Make imputation model

pred3 <- make.predictorMatrix(dat3)
pred3
meth3 <- make.method(dat3)
meth3

pred3[,] <- 0
pred3["mem", c("pid", "time", "cohort", "female", 
              "act", "ext", "avestress", "diffstress", "intstress1", 
              "intstress2", "intact", "intext", "intcohort", "intfemale")] <- c(-2, 2, 1, 1,
                                                                                1, 1, 1, 1, 1,
                                                                                1, 1, 1, 1, 1)
pred3["stress", c("pid", "time", "cohort", "female", 
              "act", "ext", "avemem", "diffmem", "intmem1", 
              "intmem2", "intact", "intext", "intcohort", "intfemale")] <- c(-2, 2, 1, 1,
                                                                                1, 1, 1, 1, 1,
                                                                                1, 1, 1, 1, 1)
pred3["avemem", c("pid", "mem")] <- c(-2, 2)
pred3["avestress", c("pid", "stress")] <- c(-2, 2)

meth3[c("mem", "stress")] <- "2l.pan"
meth3[c("avemem", "avestress")] <- "2l.groupmean"
meth3["diffmem"] <- "~I(mem - avemem)"
meth3["diffstress"] <- "~I(stress - avestress)"
meth3["intmem1"] <- "~I(time*avemem)"
meth3["intmem2"] <- "~I(time*diffmem)"
meth3["intstress1"] <- "~I(time*avestress)"
meth3["intstress2"] <- "~I(time*diffstress)"

# Step 5 Sequence of imputation

visit3 <- c("mem", "avemem", "diffmem", "intmem1", "intmem2", 
            "stress", "avestress", "diffstress", "intstress1", "intstress2")

# Step 6 Impute data

imp3 <- mice(dat3, pred=pred3, meth=meth3, m=30, maxit=20, 
             visit=visit3, seed=123321, allow.na = TRUE)

# Step 7 Check convergence

plot(imp3, c("mem", "stress"))

# Step 8 Transform data back to original scale

impdat3 <- complete(imp3, "long", include = TRUE)
impdat3$mem <- sd_mem3*impdat3$mem + m_mem3
impdat3$stress <- sd_stress3*impdat3$stress + m_stress3
impdat3$act <- sd_act3*impdat3$act + m_act3
impdat3$ext <- sd_ext3*impdat3$ext + m_ext3
imp32 <- as.mids(impdat3)

# Step 9 Analyze each data

library(lme4)
fit31 <- with(imp32, lmer(mem ~ time + (1|pid), REML=FALSE, 
                          control = lmerControl(optimizer ="Nelder_Mead")))
fit32 <- with(imp32, lmer(mem ~ time + (1 + time|pid), REML=FALSE, 
                          control = lmerControl(optimizer ="Nelder_Mead")))

# Step 10 Pool the results

out31 <- pool(fit31)
summary(out31)

out32 <- pool(fit32)
summary(out32)

source("mimlmtools.R")
ranefMI(fit31)
ranefMI(fit32)
sigmaMI(fit31)
sigmaMI(fit32)

anovaMI(fit31, fit32)

###########################################################################################

### Example 4.2 Impute in wide format

dat3 <-read.table("lecture10ex3.csv", sep=",", header=TRUE, na.strings="999999")
dat3 <- dat3[,-1]
psych::describe(dat3)

# Change data to wide format

dat3w <- reshape(data=dat3, idvar="pid", timevar="time", v.names=c("mem", "stress"), direction="wide")

library(mice)
md.pattern(dat3w)

# Step 1-3 Not needed

# Step 4 Make imputation model

pred3w <- make.predictorMatrix(dat3w)
pred3w
meth3w <- make.method(dat3w)
meth3w

pred3w[,] <- 0
pred3w[c("mem.2", "mem.3", "mem.4", "stress.2", "stress.3", "stress.4"), ] <- 1
pred3w[, "pid"] <- 0
diag(pred3w) <- 0

meth3w[c("mem.2", "mem.3", "mem.4", "stress.2", "stress.3", "stress.4")] <- "pmm"

# Step 5 Not needed
# Step 6 Impute data

imp3w <- mice(dat3w, pred=pred3w, meth=meth3w, m=30, maxit=20, seed=123321)

# Step 7 Check convergence

plot(imp3w, c("mem.2", "mem.3"))
plot(imp3w, c("mem.4", "stress.2"))
plot(imp3w, c("stress.3", "stress.4"))

# Step 8 Not needed
# However, need to change data back to long format

impdat3w <- complete(imp3w, "long", include = TRUE)

mem <- c("mem.1", "mem.2", "mem.3", "mem.4")
stress <- c("stress.1", "stress.2", "stress.3", "stress.4")
timevarying <- list(mem, stress)

impdat3w$id2 <- 1:nrow(impdat3w)

dat3l <- reshape(data=impdat3w, idvar="id2", times=0:3, timevar="time", 
                 varying = timevarying, v.names=c("mem", "stress"), direction="long")

dat3l$.id <- ave(dat3l$id2, dat3l$.imp, FUN=seq_along)
imp3w2 <- as.mids(dat3l)

# Step 9 Analyze each data

library(lme4)
fit3w1 <- with(imp3w2, lmer(mem ~ time + (1|pid), REML=FALSE, 
                          control = lmerControl(optimizer ="Nelder_Mead")))
fit3w2 <- with(imp3w2, lmer(mem ~ time + (1 + time|pid), REML=FALSE, 
                          control = lmerControl(optimizer ="Nelder_Mead")))

# Step 10 Pool the results

out3w1 <- pool(fit3w1)
summary(out3w1)

out3w2 <- pool(fit3w2)
summary(out3w2)

source("mimlmtools.R")
ranefMI(fit3w1)
ranefMI(fit3w2)
sigmaMI(fit3w1)
sigmaMI(fit3w2)

anovaMI(fit3w1, fit3w2)