### Name: Analysis for the application test of the UQMP
### Last modified: 16/03/2023
### Authors: Anesa Aljovic, Benedikt Iberl


### setting options

options(scipen = 999)
set.seed(123)



### data preparation

# set working directory
# setwd()

# read data
dat <- read.csv("data.csv", header = TRUE, sep = ";", skipNul = TRUE)

# exclude irrelevant columns
dat <- dat[-c(2, 3, 4, 5, 44, 45, 47)]

# rename columns
names(dat) <- c("id", "start", "uqm.alk", "anonym", "sensitive",
		"alk.dq.1w", "alk.dq.1m", "alk.dq.6m", "alk.dq.12m",
		"urne1", "uqm.vs.dq", "urne2", "uqm.time", "urne3",
		"pois.time", "drivecheck", "attcheck", "gender", "age", 
		"education", "quotegender", "quoteage", "bil.res.ID",
		"eyecol.dq", "eyecol.uqm", "part.conf", "del.code", "time1",
		"time2", "time3", "time4", "time5", "time6", "time7", "time8",
		"time9", "time10", "time11", "timesum", "finished",
		"lastpage", "maxpage", "missing", "missrel", "time.rsi",
		"deg.time")

# transforming "1 and 2"-variables into indicator variables (0 and 1)
qst <- c("uqm.alk", "alk.dq.1w", "alk.dq.1m", "alk.dq.6m", "alk.dq.12m", 
	 "eyecol.dq", "eyecol.uqm")
for(i in qst){
	dat[i] <- dat[i]*(-1)+2
}

# transforming demographic variables into factors
trf.gender <- c("w", "m", "d")

for(i in 1:length(trf.gender)){
	dat$gender[dat$gender == i] <- trf.gender[i]
}
dat$gender <- as.factor(dat$gender)

trf.education <- c("none", "grundhaupt", "real", "abitur", "ausbildung", "uni")

for(i in 1:length(trf.education)){
	dat$education[dat$education == i] <- trf.education[i]
}
dat$education <- as.factor(dat$education)



### data exclusion

# excluding incomplete data
dat <- subset(dat, maxpage == 11)

# excluding anyone who failed the attention test (chosing option 4, "London")
dat <- subset(dat, attcheck == 4)

# excluding participants answering considerably quicker than average
# (RSI >= 2.0 according to Leiner)

# shared pages for all groups: 1, 2, 3, 8, 11 -> common median
# non-shared pages: 4, 5, 6, 7, 9, 10 -> group median

rsi   <- numeric(nrow(dat))
rsis  <- numeric(nrow(dat))
nacnt <- numeric(nrow(dat))

for(i in 1:nrow(dat)) {
  for(j in 28:38) {
    
    if (j <= 30 | j == 35 | j == 38) {
      a <- median(dat[,j], na.rm = TRUE)/dat[i,j]
      if (is.na(a) == TRUE) {a <- 0}
      else if (a > 3) {a <- 3} else {}
      rsis[i] <- rsis[i] + a
    }
    
    else {
      
      if (dat[i,]$uqm.vs.dq != 4) {
        a <- median(dat[,j][dat$uqm.vs.dq != 4], na.rm = TRUE)/dat[i,j]
        if (is.na(a) == TRUE) {a <- 0}
        else if (a > 3) {a <- 3} else {}
        rsis[i] <- rsis[i] + a
      }
    
      else if (dat[i,]$uqm.vs.dq == 4) {
        a <- median(dat[,j][dat$uqm.vs.dq == 4], na.rm = TRUE)/dat[i,j]
        if (is.na(a) == TRUE) {a <- 0}
        else if (a > 3) {a <- 3} else {}

        rsis[i] <- rsis[i] + a
      } 
    }
  }

  nacnt[i] <- length(which(is.na(dat[i, 28:38])))
  rsi[i]   <- rsis[i]/(length(28:38) - nacnt[i])
}

# excluding participants with RSI >= 2.0
dat <- cbind(dat, rsi)
dat <- dat[dat$rsi < 2.0,]

# forming separate data frames for uqm and dq groups
datuqm <- subset(dat, uqm.vs.dq == 1 | uqm.vs.dq == 2 | uqm.vs.dq == 3)
datdq  <- subset(dat, uqm.vs.dq == 4)



### sample description

## demographics
# correcting false age inputs
dat$age[dat$age ==  667] <- 67
dat$age[dat$age == 1990] <- 32

# age statistics
age.sum <- summary(dat$age)
age.sd  <- sd(dat$age, na.rm=TRUE)

# age quota statistics
dat$quoteage <- factor(dat$quoteage,
		       levels = c(2,3,4,5,6),
		       labels = c("18-29", "30-39", "40-49", "50-59","60+"))
age.perc <- round(table(dat$quoteage)/sum(table(dat$quoteage))*100, 1)

# sex statistics
sex.tab  <- table(dat$gender)
sex.perc <- round(table(dat$gender)/sum(table(dat$gender))*100, 1)


# education statistics
dat$education <- factor(dat$education,
			levels = c("none", "grundhaupt", "real", "abitur",
				   "ausbildung", "uni"),
			labels = c("none", "grundhaupt", "real",
				   "abitur", "ausbildung", "uni"))
edu.perc <- round(table(dat$education)/sum(table(dat$education))*100, 1)


## completion time
# completion time statistics for all participants
time.sum <- summary(dat$timesum)
time.sd  <- sd(dat$timesum)

# completion time statistics for participants in uqm groups
tuqm.sum <- summary(datuqm$timesum)
tuqm.sd  <- sd(datuqm$timesum)

# completion time statistics for participants in dq groups
tdq.sum  <- summary(datdq$timesum)
tdq.sd   <- sd(datdq$timesum)


## group statistics
# general group statistics (uqm vs. dq)
Ngroup.uqm <- sum(table(dat$uqm.vs.dq)[1:3]) 		# uqm
Ngroup.dq  <- table(dat$uqm.vs.dq)[4] 			# dq

# uqm group statistics (t_j)
Ntime.uqm  <- table(datuqm$uqm.time)

# dq group statistics (t_j)
Ntime.dq   <- table(datdq$pois.time)



### data analysis: eye color

## DQ
# prevalence
ec.dq <- mean(dat$eyecol.dq, na.rm = TRUE)

# CI
n        <- sum(table(dat$eyecol.dq))
margin   <- qnorm(0.975)*sqrt(ec.dq*(1-ec.dq)/n)
ec.dq.CI <- c(ec.dq - margin, ec.dq + margin)


## UQM
# design parameters and observed variables
p     <- 245.25/365.25		# prob. of getting sensitive question
q     <- 181.25/365.25		# prob. of answering "yes" to neutral question
gamma <- mean(dat$eyecol.uqm, na.rm = TRUE)

# UQM formula pi
UQM_fct <- function(gamma, p, q) {
  pi_s  <- (gamma - (1-p)*q) / p
  print(pi_s)
}

# UQM formula sigma
UQM_var  <- function(gamma, p, n) {
  var_pi <- (gamma * (1-gamma)) / (n*p^2)
  print(var_pi)
}

# UQM formula CI
UQM_ci   <- function(gamma, p, q, n) {
  pi_s   <- (gamma - (1-p)*q) / p
  var_pi <- (gamma * (1-gamma)) / (n*p^2)
  ci     <- c(pi_s - qnorm(0.975)*sqrt(var_pi),
	      pi_s + qnorm(0.975)*sqrt(var_pi))
  print(ci)
}

# prevalence and statistics
pi_rrt  <- UQM_fct(gamma, p, q)			
ci_rrt  <- UQM_ci( gamma, p, q, sum(table(dat$eyecol.uqm)))
var_rrt <- UQM_var(gamma, p, sum(table(dat$eyecol.uqm)))



### data analysis: questionnaire impression

# anonymity
anon.sum <- summary(dat$anonym)
anon.sd  <- sd(dat$anonym, na.rm=TRUE)
anon.tt  <- t.test(datuqm$anonym, datdq$anonym, paired=FALSE)

# sensitivity
sens.sum <- summary(dat$sensitive)
sens.sd  <- sd(dat$sensitive, na.rm=TRUE)
sens.tt  <- t.test(datuqm$sensitive, datdq$sensitive, paired=FALSE)



### data analysis: drinking and driving

## parameters
# values for parameter estimation
lim        <- 1e-10				# lower limit for lambda (0)
up_lim_lam <- 10				# upper limit for lambda (7)

# time frames t_j for all groups j = 1, 2, 3, 4
t0 <- c(.25, 1, 6, 12)


## DQ / standard Poisson model

# observed values
# N (participants per subgroup/time-frame)
N.t.dq <- c(sum(table(dat$alk.dq.1w)), sum(table(dat$alk.dq.1m)),
            sum(table(dat$alk.dq.6m)), sum(table(dat$alk.dq.12m)))
# a (yes-answers per subgroup/time-frame)
a.dq   <- c(sum(dat$alk.dq.1w, na.rm=TRUE), sum(dat$alk.dq.1m, na.rm=TRUE),
            sum(dat$alk.dq.6m, na.rm=TRUE), sum(dat$alk.dq.12m, na.rm=TRUE))
# b (no-answers per subgroup/time-frame)
b.dq   <- N.t.dq - a.dq

# function of the prevalence curve
pc.dq  <- function(t, pd, lam) {pd*(1-exp(-lam*t))}

# log-likelihood-function
MLE.dq   <- function(par, a, b){
  pd     <- par[1]
  lam    <- par[2]
  
  pyes   <- pc.dq(t0, pd, lam)
  
  lL     <- a*log(pyes) + b*log(1-pyes)
  
  MLE.dq <- -sum(lL)
}

# G function for testing model fit
Gf.dq     <- function(par, a, b){
  pd      <- par[1]
  lam     <- par[2]
  N.t     <- a + b
  
  E.t.yes <- pc.dq(t0, pd, lam)*N.t
  E.t.no  <- N.t - E.t.yes
  
  G       <- 2*sum(a*log(a/E.t.yes) +
		   b*log(b/E.t.no))
  
  return(G)
}


# parameter estimation via bootstrap sampling
# number of bootstrap samples
nb <- 1000

# "vectorizing" observed yes- and no-answers for every group
obs.t1.dq <- c(rep(1, a.dq[1]), rep(0, b.dq[1]))
obs.t2.dq <- c(rep(1, a.dq[2]), rep(0, b.dq[2]))
obs.t3.dq <- c(rep(1, a.dq[3]), rep(0, b.dq[3]))
obs.t4.dq <- c(rep(1, a.dq[4]), rep(0, b.dq[4]))

# bootstrap sampling
PI.b.dq   <- numeric(nb)
lam.b.dq  <- numeric(nb)

for(i in 1:nb){
  # resampling a and b from observed data
  a.b.dq <- c(sum(sample(x = obs.t1.dq, size = N.t.dq[1], replace = TRUE)),
	      sum(sample(x = obs.t2.dq, size = N.t.dq[2], replace = TRUE)),
	      sum(sample(x = obs.t3.dq, size = N.t.dq[3], replace = TRUE)),
	      sum(sample(x = obs.t4.dq, size = N.t.dq[4], replace = TRUE)))
  b.b.dq <- N.t.dq - a.b.dq
  
  # maximum likelihood estimation of the redrawn sample
  ML.dq  <- optim(par = c(0.5, 0.8), a = a.b.dq, b = b.b.dq,
		  fn = MLE.dq, method = 'L-BFGS-B',
		  lower = c(lim, lim),
		  upper = c(1-lim, up_lim_lam))

  # extracting pi and lambda estimates
  PI.b.dq[i]  <- ML.dq$par[1]
  lam.b.dq[i] <- ML.dq$par[2]
}

# extracting parameter estimators, SEs and 95%-CIs
PI.m.dq   <- mean(PI.b.dq)			# point estimate for pi
PI.se.dq  <- sd(PI.b.dq)			# standard error for pi
PI.ci.dq  <- quantile(PI.b.dq, c(.025, .975))	# 95%-CI for pi

lam.m.dq  <- mean(lam.b.dq)			# point est. for lambda
lam.se.dq <- sd(lam.b.dq)			# se for lambda
lam.ci.dq <- quantile(lam.b.dq, c(.025, .975))	# .95-CI for lambda

# G-test
Gest.dq <- optim(c(0.5,0.8), a = a.dq, b = b.dq,
		 fn = Gf.dq,
		 method='L-BFGS-B',
		 lower = c(lim, lim),
		 upper = c(1-lim, up_lim_lam))

# extracting G-value
Gval.dq <- Gest.dq$value
# computing p-value for G-test decision
pval.dq <- pchisq(Gest.dq$value, df = 1, lower.tail = FALSE)


# prevalence curve graphic
curve(PI.m.dq*(1-exp(-lam.m.dq*x)), ylim = c(0, 1), xlim = c(0, 12),
      xlab = "time [months]", ylab = "P('yes' | t)", main = "DQ",
      cex.main = 1)
# abline(h = PI.m.dq, col = "gray30", lty = 2)	# asymptote
# add 95%-CIs
for(j in 1:length(t0)){
	arrows(t0[j], a.dq[j]/N.t.dq[j] + .007,
	       t0[j], a.dq[j]/N.t.dq[j] + qnorm(.975) * 
	       		sqrt(a.dq[j]/N.t.dq[j]*(1-a.dq[j]/N.t.dq[j]) /
				     N.t.dq[j]),
	       length = .05, angle = 90)
	arrows(t0[j], a.dq[j]/N.t.dq[j] - .007,
	       t0[j], a.dq[j]/N.t.dq[j] - qnorm(.975) *
	       		sqrt(a.dq[j]/N.t.dq[j]*(1-a.dq[j]/N.t.dq[j]) /
				     N.t.dq[j]),
	       length = .05, angle = 90)
}
# add points
points(t0, a.dq/N.t.dq, pch = 1)



## UQM / Poisson extension for the UQM

# N (participants per subgroup/time-frame)
N.t.uqm <- as.vector(table(dat$uqm.time[dat$uqm.vs.dq != 4]))
# a (yes-answers per subgroup/time-frame)
a.uqm   <- c(sum(dat$uqm.alk[dat$uqm.time == 1 & dat$uqm.vs.dq != 4]),
	     sum(dat$uqm.alk[dat$uqm.time == 2 & dat$uqm.vs.dq != 4]),
	     sum(dat$uqm.alk[dat$uqm.time == 3 & dat$uqm.vs.dq != 4]),
	     sum(dat$uqm.alk[dat$uqm.time == 4 & dat$uqm.vs.dq != 4]))
# b (no-answers per subgroup/time-frame)
b.uqm   <- N.t.uqm - a.uqm

# function of the UQM prevalence curve
pc.uqm  <- function(t,p,q,PI,lam){p*PI*(1-exp(-lam*t))+(1-p)*q}

# log-likelihood-function
MLE.uqm <- function(par, a, b){
	PI      <- par[1]
	lam     <- par[2]

	pyes    <- pc.uqm(t0,p,q,PI,lam)

	lLu     <- a*log(pyes) + b*log(1-pyes)

	MLE.uqm <- -sum(lLu)
}

# G function for testing model fit
Gf.uqm <- function(par, a, b){
	PI      <- par[1]
	lam     <- par[2]
	N.t     <- a + b
        
        E.t.yes <- pc.uqm(t0,p,q,PI,lam)*N.t
        E.t.no 	<- N.t - E.t.yes
                
        G.uqm   <- 2*sum(a*log(a/E.t.yes) +
			 b*log(b/E.t.no))
        return(G.uqm) 
}

# parameter estimation via bootstrap sampling
# "vectorizing" observed yes- and no-answers for every group
obs.t1.uqm <- c(rep(1, a.uqm[1]), rep(0, b.uqm[1]))
obs.t2.uqm <- c(rep(1, a.uqm[2]), rep(0, b.uqm[2]))
obs.t3.uqm <- c(rep(1, a.uqm[3]), rep(0, b.uqm[3]))
obs.t4.uqm <- c(rep(1, a.uqm[4]), rep(0, b.uqm[4]))

# bootstrap sampling
PI.b.uqm   <- numeric(nb)
lam.b.uqm  <- numeric(nb)

for(i in 1:nb){
  # resampling a and b from observed data
  a.b.uqm <- c(sum(sample(x = obs.t1.uqm, size = N.t.uqm[1], replace = TRUE)),
	       sum(sample(x = obs.t2.uqm, size = N.t.uqm[2], replace = TRUE)),
	       sum(sample(x = obs.t3.uqm, size = N.t.uqm[3], replace = TRUE)),
	       sum(sample(x = obs.t4.uqm, size = N.t.uqm[4], replace = TRUE)))
  b.b.uqm <- N.t.uqm - a.b.uqm
  
  # maximum likelihood estimation of the redrawn sample
  ML.uqm <- optim(par = c(0.5, 0.8), a = a.b.uqm, b = b.b.uqm,
		  fn = MLE.uqm, method = 'L-BFGS-B',
		  lower = c(lim, lim), 
		  upper = c(1-lim, up_lim_lam))
  
  # extracting pi and lambda estimates
  PI.b.uqm[i]  <- ML.uqm$par[1]
  lam.b.uqm[i] <- ML.uqm$par[2]
}

# extracting parameter estimators, SEs and 95%-CIs
PI.m.uqm   <- mean(PI.b.uqm)			# point estimate for pi
PI.se.uqm  <- sd(PI.b.uqm)			# standard error for pi
PI.ci.uqm  <- quantile(PI.b.uqm, c(.025, .975))	# 95%-CI for pi

lam.m.uqm  <- mean(lam.b.uqm)			# point est. for lambda
lam.se.uqm <- sd(lam.b.uqm)			# se for lambda
lam.ci.uqm <- quantile(lam.b.uqm, c(.025, .975))# 95%-CI for lambda

# G-test
Gest.uqm   <- optim(c(0.5,0.8), a = a.uqm, b = b.uqm,
		    fn = Gf.uqm,
		    method='L-BFGS-B',
		    lower = c(lim, lim),
		    upper = c(1-lim, up_lim_lam))

# extracting G-value
Gval.uqm <- Gest.uqm$value
# computing p-value for G-test decision
pval.uqm <- pchisq(Gest.uqm$value, df = 1, lower.tail = FALSE)

# confidence intervals for UQM pi estimates
ci.uqm   <- UQM_ci(gamma = a.uqm/N.t.uqm, p = p, q = q, n = N.t.uqm)


# UQM prevalence curve graphic
curve(PI.m.uqm*(1-exp(-lam.m.uqm*x)), ylim = c(0, 1),
      xlim = c(0, 12), xlab = "time [months]",
      ylab = "P('yes' | t, sensitive question)",
      main = "UQM", cex.main = 1)
# abline(h = PI.m.uqm)				# asymptote
for(j in 1:length(t0)){
	arrows(t0[j], ((a.uqm[j]/N.t.uqm[j]) - (1-p)*q) / p + .007,
	       t0[j], ci.uqm[j+4],
	       length = .05, angle = 90)
	arrows(t0[j], ((a.uqm[j]/N.t.uqm[j]) - (1-p)*q) / p - .007,
	       t0[j], ci.uqm[j],
	       length = .05, angle = 90)
}
# add points
points(t0, ((a.uqm/N.t.uqm) - (1-p)*q)/p, pch = 1)


