dat <- read.csv("http://www.sunthud.com/glm.csv", na.string = "999999") colnames(dat)[1] <- "university" dat[,1] <- factor(dat[,1]) dat[,2] <- factor(dat[,2], labels = c("male", "female")) dat[,5] <- factor(dat[,5], labels = c("science", "social", "humanities", "health", "business")) dat[,8] <- factor(dat[,8], labels = c("y", "n")) dat[,9] <- factor(dat[,9], labels = c("y", "n")) dat[,10] <- factor(dat[,10], labels = c("y", "n")) dat[,11] <- factor(dat[,11], labels = c("y", "n")) dat[,8] <- relevel(dat[,8], ref = "n") dat[,9] <- relevel(dat[,9], ref = "n") dat[,10] <- relevel(dat[,10], ref = "n") dat[,11] <- relevel(dat[,11], ref = "n") out1 <- glm(take ~ lgpa + gpax, data = dat, family = binomial()) summary(out1) coef(out1) exp(coef(out1)) exp(confint(out1)) 1:3 7:9 expand.grid(x = 1:3, y = 7:9) iv <- expand.grid(lgpa = seq(1, 4, 0.1), gpax = 1:4) pred <- predict(out1, iv) prob <- exp(pred)/(1 + exp(pred)) data.frame(iv, prob) library(boot) prob2 <- inv.logit(pred) plot(iv[,1], inv.logit(pred), type = "n", xlab = "Last Semester GPA", ylab = "P(Cheating Helped by Others)") ivwithprob <- data.frame(iv, prob = inv.logit(pred)) with(ivwithprob[ivwithprob$gpax == 1,], lines(lgpa, prob, col = "orange")) with(ivwithprob[ivwithprob$gpax == 2,], lines(lgpa, prob, col = "green")) with(ivwithprob[ivwithprob$gpax == 3,], lines(lgpa, prob, col = "blue")) with(ivwithprob[ivwithprob$gpax == 4,], lines(lgpa, prob, col = "black")) legend("topleft", paste0("GPAX = ", 1:4), lty = 1, col = c("orange", "green", "blue", "black")) out2 <- glm(take ~ lgpa + gpax + university, data = dat, family = binomial()) anova(out1, out2, test = "Chisq") summary(out2) university <- factor(1:7) iv2 <- expand.grid(university = university, lgpa = 3, gpax = 3) inv.logit(predict(out2, iv2)) predict(out2, iv2, type = "response") cdplot(take ~ lgpa, data = dat) cdplot(take ~ gpax, data = dat) out3 <- glm(take ~ lgpa + gpax + I(gpax^2), data = dat, family = binomial()) summary(out3) anova(out1, out3, test = "Chisq") iv <- expand.grid(lgpa = seq(1, 4, 0.1), gpax = 1:4) pred <- predict(out3, iv) prob <- exp(pred)/(1 + exp(pred)) data.frame(iv, prob) plot(iv[,1], inv.logit(pred), type = "n", xlab = "Last Semester GPA", ylab = "P(Cheating Helped by Others)") ivwithprob <- data.frame(iv, prob = inv.logit(pred)) with(ivwithprob[ivwithprob$gpax == 1,], lines(lgpa, prob, col = "orange")) with(ivwithprob[ivwithprob$gpax == 2,], lines(lgpa, prob, col = "green")) with(ivwithprob[ivwithprob$gpax == 3,], lines(lgpa, prob, col = "blue")) with(ivwithprob[ivwithprob$gpax == 4,], lines(lgpa, prob, col = "black")) legend("topleft", paste0("GPAX = ", 1:4), lty = 1, col = c("orange", "green", "blue", "black")) out4 <- glm(take ~ lgpa + gpax + I(gpax^2) + university, data = dat, family = binomial()) summary(out4) anova(out3, out4, test = "Chisq") out5 <- glm(take ~ sex*university, data = dat, family = binomial()) summary(out5) car::Anova(out5, type = 3) library(phia) testInteractions(out5, fixed = "sex", across = "university") testInteractions(out5, fixed = "university", across = "sex") plot(interactionMeans(out5)) ## Ordinal Logistic Regression dat <- data.frame(dat, cheatlvl = ordered((dat[,9] == "y") + (dat[,10] == "y") + (dat[,11] == "y"))) library(MASS) out6 <- polr(cheatlvl ~ lgpa + gpax + university, data = dat, Hess=TRUE) sumout6 <- summary(out6) tval <- sumout6[["coefficients"]][,3] pvalue <- round(pnorm(-abs(tval)) * 2, 4) exp(coef(out6)) exp(confint(out6)) iv6 <- expand.grid(university = factor(1:7), lgpa = 3, gpax = 3) predict(out6, newdata = iv6, "probs") cdplot(cheatlvl ~ university, data = dat) cdplot(cheatlvl ~ lgpa, data = dat) cdplot(cheatlvl ~ gpax, data = dat) # How to transform iv values to probability iv6 <- expand.grid(university = dat$university[!duplicated(dat$university)], lgpa = 2.5, gpax = 2.5) iv6 <- model.matrix(~ university + lgpa + gpax, data = iv6)[,-1] pred <- iv6 %*% coef(out6)[colnames(iv6)] zeta <- out6[["zeta"]] cumprobs <- inv.logit(outer(zeta, pred, "-")) cumprobs <- rbind(0, cumprobs[,,1], 1) probs <- cumprobs[2:5,] - cumprobs[1:4,] ########## Multinomial Logistic Regression self <- as.numeric((dat[,9] == "y" | dat[,11] == "y")) other <- as.numeric((dat[,10] == "y")) grp <- factor((1 - self) * other + 2*self, labels = c("nocheat", "other", "self")) dat <- data.frame(dat, grp = relevel(grp, ref = "nocheat")) library(nnet) out7 <- multinom(grp ~ lgpa + gpax + university, data = dat, Hess = TRUE) summary(out7) est <- summary(out7)[["coefficients"]] se <- summary(out7)[["standard.errors"]] tval <- est/se pvalue <- pnorm(-abs(tval))*2 iv7 <- expand.grid(university = factor(1:7), lgpa = 3, gpax = 3) predict(out7, newdata = iv7, "probs") cdplot(grp ~ university, data = dat) # Poisson Regression out8 <- glm(I(as.numeric(cheatlvl)) ~ lgpa + gpax + university, data = dat, family = poisson()) summary(out8) iv8 <- expand.grid(university = dat$university[!duplicated(dat$university)], lgpa = 3, gpax = 3) exp(predict(out8, iv8)) predict(out8, iv8, type = "response") # Negative Binomial Regression out9 <- glm.nb(I(as.numeric(cheatlvl)) ~ lgpa + gpax + university, data = dat, control = glm.control(maxit = 1000)) summary(out9) iv9 <- expand.grid(university = dat$university[!duplicated(dat$university)], lgpa = 3, gpax = 3) exp(predict(out9, iv9)) predict(out9, iv9, type = "response") X2 <- 2 * (logLik(out9) - logLik(out8)) X2 pchisq(X2, df = 1, lower.tail=FALSE) ####################### ssgroup <- cut(dat$SS_ALL, c(-Inf, quantile(dat$SS_ALL, c(0.25, 0.50, 0.75)), Inf)) dat <- data.frame(dat, ssgroup) out6 <- polr(ssgroup ~ sex + edyear + FGROUP, data = dat, Hess=TRUE) out62 <- polr(ssgroup ~ sex + edyear + FGROUP + university, data = dat, Hess=TRUE) summary(out6) coef(out6) vcov(out6) exp(coef(out6)) exp(confint(out6)) anova(out1, out2, test = "Chisq") out6[["zeta"]] medmas <- median(dat$MASTERY) medper <- median(dat$PERFORM) gmas <- dat$MASTERY > medmas gper <- dat$PERFORM > medper style <- rep(NA, nrow(dat)) style[gmas & gper] <- 1 style[!gmas & gper] <- 2 style[gmas & !gper] <- 3 style[!gmas & !gper] <- 4 style <- factor(style, labels = c("both", "mastery", "performance", "neither")) style <- relevel(style, ref = "neither") dat <- data.frame(dat, style) library(nnet) out7 <- multinom(style ~ lgpa + gpax, data = dat, Hess = TRUE) summary(out7) est <- summary(out7)[["coefficients"]] se <- summary(out7)[["standard.errors"]] tval <- est/se pvalue <- pnorm(-abs(tval))*2 out7a <- multinom(style ~ lgpa + gpax + FGROUP, data = dat, Hess = TRUE) summary(out7a) anova(out7, out7a) est <- summary(out7a)[["coefficients"]] se <- summary(out7a)[["standard.errors"]] tval <- est/se pvalue <- pnorm(-abs(tval))*2