
# Calculate several indicators for the given data
# @param data.frame data
# @param list vars List with elements "optional", "compulsory", "scales" und "times"
# @return data.frame
dcIndicators = function(data, vars) {
    indicators = data.frame(row.names = row.names(data))
    indicators$condition = data$condition
    
    ##### Find variables with a certain pattern and max. 20% missing data
    getUsableVars = function(x, pattern=NULL) {
        if (is.null(pattern)) {
            vars = names(x)
        } else {
            vars = grep(pattern, names(x), value=T)
        }
        usable = names(x[vars])[colSums(is.na(x[vars])) <= 0.2 * nrow(x)]
        return(usable)
    }
    
    ##### "qmiss" = "Item Non-Response (percentage)",
    indicators$qmiss = data$MISSING
    if ("optional" %in% names(vars)) {
        indicators$qmis2 = rowSums(is.na(data[vars$optional])) / length(vars$optional)
    }
    
    ##### "qmisr" = "Item Non-Response (weighted)"
    weights = 1 - (colSums(is.na(data[vars$optional])) / nrow(data))
    weights = weights / sum(weights) # sum normed to 1
    fWeight = function(x, wt) {
        return(x * wt)
    }
    # Weight each NA with the non-NA-probability
    indicators$qmisr = data$MISSREL
    if ("optional" %in% names(vars)) {
        indicators$qmir2 = colSums(apply(is.na(data[vars$optional]), fWeight, wt=weights, MARGIN=1))
    }
    rm(weights, fWeight)
    
    ##### "qmidk" = "Don't Know Responses"
    ### Using all optional questions (that have been shown to all respondents)
    ### Export will transform DK responses to NA
    ### DK was available in S1/S2, only - and only for the issues (that could not be omitted)
    if ("compulsory" %in% names(vars)) {
        # Note: NAs can occur when completed without JS
        indicators$qmidk = rowSums((data[vars$compulsory] == "%dk%"), na.rm=T) / length(vars$compulsory)
    }
    
    ##### "qmisd" = "Item Non-Response or Don't Know",
    if ("compulsory" %in% names(vars)) {
        indicators$qmisd = rowSums(cbind((data[vars$compulsory] == "%dk%"), is.na(data[vars$optional])), na.rm=T) / length(c(vars$compulsory,vars$optional))
    }
    
    ##### "qtime" = "Fast Completion (Absolute Time)"
    indicators$qtime = -rowSums(data[vars$times])
    
    ##### "qtimo" = "Fast Completion (Absolute Time, Pauses Removed)",
    tmp = data[vars$times]
    for (var in vars$times) {
        pc = quantile(tmp[[var]], c(0.25,0.5,0.75), na.rm=T)
        iqr = as.numeric(pc[3] - pc[1])
        # Remove pauses
        if (pc[2] > 3600) {
            tmp[[var]] = 0
        } else {
            # Replace missing and 3sd outliers by median (if any)
            tmp[is.na(tmp[[var]]) | (tmp[[var]] > pc[2] + 3 / 1.34 * iqr), var] = pc[2]
        }
    }
    indicators$qtimo = -rowSums(tmp)
    rm(tmp, pc, iqr, var)
    
    ##### "qfast" = "Fast Completion (RSI with clipping to 3)",
    completion.RSI3 = function(times) {
        mdn = apply(times, median, na.rm=T, MARGIN=2)
        rsi3 = function(x, md) {
            val = md / x
            val[val > 3] = 3
            return(mean(val, na.rm=T))
        }
        return(as.numeric(apply(times, rsi3, md=mdn, MARGIN=1)))
    }
    
    indicators$qfast = completion.RSI3(data[vars$times])
    rm(completion.RSI3)
    
    
    ##### "qfas2" = "Fast Completion (RSI with medians)",
    completion.RSIm = function(times) {
        relative = data.frame(row.names=row.names(times))
        for (pg in names(times)) {
            # Compute relative speed per case
            relative[[pg]] = median(times[[pg]], na.rm=T) / times[[pg]]
        }
        # Median of relative time (per case)
        # is the relative speed index (RSI)
        mdn = apply(relative, FUN=median, MARGIN=1, na.rm=T) # Notiz: Das ist für SPSS ein wenig schwierig
        return(mdn)
    }
    indicators$qfas2 = completion.RSIm(data[vars$times])
    rm(completion.RSIm)
    
    ##### "qstrl" = "Straightlining (no. of straightlined scales)",
    ## TODO: Fehlende Werte in der Analyse von Muster etc. weglassen
    straight.scales = function(x, scales) {
        sl = rep(0, nrow(x))
        for (prefix in scales) {
            vars = grep(paste("^", prefix, sep=""), names(x), value=T)
            sub = x[vars]
            sl = suppressWarnings(
                sl + (apply(sub, FUN=min, MARGIN=1, na.rm=T) == apply(sub, FUN=max, MARGIN=1, na.rm=T))
            )
        }
        return(sl)
    }
    indicators$qstrl = straight.scales(data, vars$scales)
    
    straight.items = function(x, scales) {
        sl = rep(0, nrow(x))
        for (prefix in scales) {
            vars = grep(paste("^", prefix, sep=""), names(x), value=T)
            sub = x[vars]
            sl = suppressWarnings(
                sl + length(vars) *  (apply(sub, FUN=min, MARGIN=1, na.rm=T) == apply(sub, FUN=max, MARGIN=1, na.rm=T))
            )
        }
        return(sl)
    }
    indicators$qstri = straight.items(data, vars$scales)
    
    ##### "qstlg" = "Longest String (max. length of same-answer-sequences)",
    longest.string = function(x, na.rm=TRUE) {
        if (na.rm) {
            x = x[!is.na(x)];
        } else {
            x[is.na(x)] = "~ NA ~";
        }
        length = 0
        longest = 0
        last = -Inf
        for (v in x) {
            if (v == last) {
                length = length + 1;
            } else {
                if (length > longest) {
                    longest = length
                }
                length = 1;
            }
            last = v
        }
        if (length > longest) {
            longest = length
        }
        return(longest)
    }
    
    tmpSL = data.frame(row.names = row.names(data))
    for (var in vars$scales) {
        tmpVs = grep(paste("^", var, sep=""), names(data), value=T)
        tmpSL[[var]] = apply(data[tmpVs], MARGIN=1, FUN=longest.string)
    }
    indicators$qstlg = apply(tmpSL, MARGIN=1, FUN=max, na.rm=T)
    rm(longest.string, tmpSL, var, tmpVs)
    
    ##### "qsdba" = "Straightlining (small SD)",
    ##### "qsdaa" = "Crude Clicking (large SD)",
    ##### "qsdoa" = "Untypical Deviation (SD off average)",
    scales.sd = function(x, scales) {
        sds = data.frame(row.names = row.names(x))
        for (var in scales) {
            vars = grep(paste("^", var, sep=""), names(x), value=T)
            sds[var] = apply(x[vars], MARGIN=1, FUN=sd, na.rm=T) / length(unique(unlist(x[vars])))
        }
        return(rowMeans(sds, na.rm=T))
    }
    
    indicators$qsdaa = scales.sd(data, vars$scales)
    indicators$qsdba = -indicators$qsdaa
    # indicators$qsdoa = abs(indicators$qsdaa - mean(indicators$qsdaa, na.rm=T))
    # Take care that cases on both ends are found equally
    indicators$qsdoa = abs(order(indicators$qsdaa) - mean(order(indicators$qsdaa)))
    rm(scales.sd)
    
    
    ##### "qptal" = "Linear Patterns (algorithmic)",
    
    qptal = function(x, na.rm=T) {
        # Remove NA
        if (na.rm) {
            x = x[!is.na(x)]
        }
        # Need differences
        y = diff(x)
        if (length(y) < 2) {
            # No prediction possible
            return(0);
        }
        # Go thorugh the points
        c1 = NA
        c2 = NA
        points = 0
        for (v in y) {
            if (v == 0) {
                # One point for each unchanged value
                points = points + 1
            } else if (!is.na(c1) & (v == c1)) {
                # One point for each change like the one before
                points = points + 1
            } else if (!is.na(c2) & (v == c2)) {
                # Half point for each change like the one two before
                points = points + 0.5
            }
            c2 = c1
            c1 = v
        }
        return(points/length(y))
    }
    
    tmpAS = data.frame(row.names = row.names(data))
    for (tmpScale in vars$scales) {
        tmpVs = grep(paste("^", tmpScale, sep=""), names(data), value=T)
        tmpAS[tmpScale] = apply(data[tmpVs], MARGIN=1, FUN=qptal)
    }
    indicators$qptal = rowMeans(tmpAS,na.rm=T)
    rm(tmpVs, tmpScale, tmpAS, qptal)
    
    
    ##### "qptas" = "High Alternators (average 2nd derivation in scales)",
    ##### "qptba" = "Linear Patterns (small 2nd derivation)",
    ##### "qptoa" = "Untypical Derivation (2nd derivation off average)",
    qptas = function(x) {
        return(mean(abs(diff(diff(x)))))
    }
    tmpAS = data.frame(row.names = row.names(data))
    for (tmpScale in vars$scales) {
        tmpVs = grep(paste("^", tmpScale, sep=""), names(data), value=T)
        tmpAS[tmpScale] = apply(data[tmpVs], MARGIN=1, FUN=qptas)
    }
    indicators$qptas = rowMeans(tmpAS,na.rm=T)
    rm(tmpVs, tmpAS, tmpScale, qptas)
    indicators$qptba = -indicators$qptas
    indicators$qptoa = abs(order(indicators$qptas) - mean(order(indicators$qptas), na.rm=T))
    
    
    ##### "qsdst" = "Distance from Sample Mean (simple average)"
    ## Limited to numeric variables, ans especially to scale items (forget about the 2 or three other scale-like selections)
    varsScale = grep(paste("^(", paste(vars$scales, collapse="|"), ")", sep=""), names(data), value=T)
    varsMean = colMeans(data[varsScale], na.rm=T)
    varsSD = sapply(data[varsScale], FUN=sd, na.rm=T)
    indicators$qsdst = rowMeans(abs((data[varsScale] - varsMean) / varsSD), na.rm=T)
    
    
    ##### "qmdst" = "Distance from Sample Mean (Multivariate/Mahalanobis)"
    # Must remove variables with missing data (> 20%)
    usable = getUsableVars(data[varsScale])
    drop = varsScale[!(varsScale %in% usable)]
    if (length(drop) > 0) {
        print(paste("Dropping variables from distance index due to missing data (> 20%): ", paste(drop, collapse=", "), sep=""))
    }
    # Replace missing by mean
    tmp = data[usable]
    for (var in usable) {
        tmp[is.na(tmp[[var]]), var] = varsMean[[var]]
    }
    indicators$qmdst = mahalanobis(tmp, varsMean, cov=cov(tmp, use="pairwise.complete.obs"))
    indicators$qmds2 = mahalanobis(data[usable], varsMean, cov=cov(data[usable], use="pairwise.complete.obs"))
    cor(indicators$qmdst, indicators$qmds2, use="pairwise.complete.obs") # 1.0
    rm(tmp, var, varsMean, varsScale, varsSD, usable)
    
    
    ##### "qcons" = "Inconsistency (correlation of split-half scales)"
    
    # For each respondent calculate to means per scale (split-half means)
    # and then tell the correlation between the means throughout all scales
    # (these are very few data points - only one per scale!)
    dcConsistency = function(x, scales) {
        # Data from scales
        data = data.frame(row.names = row.names(x))
        sets = c()
        for (scale in scales) {
            vars = grep(paste("^", scale, "_\\d+$", sep=""), names(x), value=T)
            # Skip variables with missing data (> 20%)
            copy = x[vars]
            usable = getUsableVars(copy, pattern=paste("^", scale, "_\\d+$", sep=""))
            drop = names(copy)[!(names(copy) %in% usable)]
            if (length(drop) > 0) {
                print(paste("Dropping variables from consistency index due to missing data (> 20%): ", paste(drop, collapse=", "), sep=""))
                copy = copy[usable]
            }
            if (length(copy) == 0) {
                next
            }
            # invert negative items
            main = principal(copy, missing=T)
            pc = main$scores
            for (var in names(copy)) {
                cm = cor(copy[[var]], pc, use="pairwise.complete.obs")
                if (is.na(cm)[[1]]) {
                    warning(paste("Failed to correlate", var, "to principal component, n =", nrow(x)))
                } else if (cm < 0) {
                    print(paste("Note: Variable", var, "is correlated negatively with the principal component and will be rotated for dcConsistency"))
                    data[[var]] = -copy[[var]]
                } else {
                    data[[var]] = copy[[var]]
                }
            }
            # store variable names
            half1 = grep(paste("^", scale, "_\\d[13579]$", sep=""), vars, value=T)
            half2 = grep(paste("^", scale, "_\\d[24680]$", sep=""), vars, value=T)
            sets[[var]] = list("a" = half1, "b" = half2)
        }
        
        # Now do the correlation per respondent
        consistency = function(x, sets) {
            half1 = c()
            half2 = c()
            for (set in sets) {
                half1 = c(half1, mean(as.numeric(unlist(x[set$a])), na.rm=T))
                half2 = c(half2, mean(as.numeric(unlist(x[set$b])), na.rm=T))
            }
            if (sum(complete.cases(half1, half2)) < 3) {
                return(NA)
            } else {
                co = cor(half1, half2, use="complete.obs")
                return(co)
            }
        }
        return(apply(data, FUN=consistency, MARGIN=1, sets=sets))
    }
    
    indicators$qcons = -dcConsistency(data, vars$scales)
    
    
    ##### "qrgrs" = "Inconsistency (high within-scale residual)",
    
    dcCaseResiduals = function(x, scales) {
        
        # Residuum-index for each person (non-predictability within a scale)
        # @param data.frame dataFrame
        # @param vector vars
        # @return vector ratings
        scaleResiduum = function(data) {
            resid = data.frame(row.names = rownames(data))
            # Regression for all cases, for each variable, and residuum per case
            vars = names(data)
            for (av in vars) {
                uv = vars[-which(vars == av)]
                fm = as.formula(paste(av, " ~ ", paste(uv, collapse=" + ")))
                fit = lm(fm, data=data, na.action="na.exclude")
                resid[[av]] = residuals(fit)
            }
            
            # Average residual
            return(rowMeans(abs(resid), na.rm=T))
        }
        
        tmpRG = data.frame(row.names = row.names(x))
        for (scale in scales) {
            vars = getUsableVars(x, pattern=paste("^", scale, "_\\d+$", sep=""))
            if (length(vars) > 0) {
                tmpRG[scale] = scaleResiduum(x[vars])
                cat(scale, " M(resid)=", mean(tmpRG[[scale]], na.rm=T), " SD(resid)=", sd(tmpRG[[scale]], na.rm=T), "\n", sep="")
            } else {
                cat(scale, " not found", sep="")
            }
        }
        return(rowMeans(tmpRG,na.rm=T))
    }
    
    indicators$qrgrs = dcCaseResiduals(data, vars$scales)
    
    
    ##### "qrgba" = "Over-Consistency (small within-scale residual)"
    indicators$qrgba = -indicators$qrgrs
    
    ##### "qrgoa" = "Untypical Consistency (residuals off average)",
    indicators$qrgoa = abs(order(indicators$qrgrs) - mean(order(indicators$qrgrs), na.rm=T))
    
    
    ##### "qrg50" = "Unlikely Responses (probability < 20%)",
    ##### "qrgav" = "Unlikely Responses (avg. answer probability, 5 Items)",
    # Lassen wir weg ... da muss man immer Items "am Ende" suchen...
    
    ##### "qimcP" = "Instructional Manipulation Check (perfect)",
    if ("imcP" %in% names(vars)) {
        indicators$qimcP = data[[vars$imcP]]
    }
    ##### "qimcG" = "Instructional Manipulation Check (clicked title)",
    if ("imcG" %in% names(vars)) {
        indicators$qimcG = data[[vars$imcG]]
    }
    ##### "qimcS" = "Simple Instructional Manipulation Check",
    ##### "qimcX" = "Simple IMC and no more than 20% item non-response"
    if ("imcS" %in% names(vars)) {
        indicators$qimcS = ifelse(data[[vars$imcS]], 0, 1)
        indicators$qimcX = ifelse(data[[vars$imcS]] & (data$MISSING <= 20), 0, 1)
    }
    ##### "qires" = "Instructed Response"
    if ("instructed" %in% names(vars)) {
        indicators$qires = data[[vars$instructed]]
    }
    ##### "qbogs" = "Bogus Items"
    if ("bogus" %in% names(vars)) {
        indicators$qbogs = data[[vars$bogus]]
    }
    
    ##### Store in common list
    return(indicators)
}

# Compute performance ratings for the indicator to tell the cases in condition apart
# @param character[] condition Either "EG" (experimental group) or "CG" (control group)
# @param numeric[] indicator Indicator for the case, if the indicator is above the threshold, "EG" is assumed
# @param character desc Description for the indicator
# @param numeric offset Assume this amount (percentage) of cases from the control group (CG) to also contain poor data
# @return data.frame
dcPerformance = function(condition, indicator, desc, offset) {
    # Encountered problems with boolean variables
    indicator = as.numeric(indicator)
    # Cannot fit R² if having extreme outliers
    completes = complete.cases(condition, indicator)
    inM = median(indicator, na.rm=T)
    pc = quantile(indicator, c(0.25,0.5,0.75), na.rm=T)
    iqr = as.numeric(pc[3] - pc[1])
    if (is.na(iqr)) {
        warning(paste("Unable to compute IQR for", desc))
        completes2 = completes
    } else if (iqr > 0) {
        completes2 = completes & (indicator > inM - 100 * iqr) & (indicator < inM + 100 * iqr)
    } else {
        # Okay, there are some indicators with only few descrete values (no. of straight lines)
        completes2 = completes
    }
    
    # Some indicators were computed for only a part of the data
    if (sum(completes) < 0.8 * length(indicator)) {
        condition = condition[completes]
        indicator = indicator[completes]
        completes = complete.cases(condition, indicator)
        completes2 = completes
        cat(paste("Compute", desc, "only for partial data set, N =", length(completes), "\n"))
    }
    
    # The complete data
    cp.condition = condition[completes]
    cp.indicator = indicator[completes]
    
    # R²
    # Need a pseudo R² (there is no R² in logistic regression) -> Nagelkerke's R² is sufficient for the job.
    # {Steyerberg 2009 #3350} -> Basically, the relationship between the LR statistic and Nagelkerke's R² is approximately linear
    R2 = NA
    tryCatch({
        fit = lrm(condition[completes2] ~ indicator[completes2])
        R2 = fit$stats[["R2"]]
    }, warning = function(w) {
    }, error = function(e) {
    })
    
    # CutOff value (based on valid cases, only - anything else won't work)
    targetSize = sum(cp.condition == "EG") + offset * sum(cp.condition == "CG")
    targetPC = targetSize / length(cp.condition)
    pc = quantile(cp.indicator, 1 - targetPC)
    cutoff = pc[[1]]
    targetIS = sum(cp.indicator >= cutoff)
    # Not necessary due to >=
    # if (targetIS < 0.90 * targetSize) {
    #     cat("Changing cutoff ", cutoff, " (n=", targetIS ,") for ", desc, "\n", sep="")
    #     cutoff = cutoff - 1
    #     targetIS = sum(cp.indicator >= cutoff)
    # }
    # Makes no sense to choose the minimum as cutoff
    if (cutoff == min(cp.indicator)) {
        # Find next available value
        tab = table(cp.indicator, useNA="no")
        if (length(tab) > 1) {
            newcut = as.numeric(names(tab)[[2]])
            cat("Changing cutoff from ", cutoff, " (n=", targetIS ,") to next value ", newcut, " for ", desc, "\n", sep="")
            # print(tab)
            cutoff = newcut
            targetIS = sum(cp.indicator >= cutoff)
        } else {
            cat("Warning: Unable to change cutoff value for", desc, "\n")
            # print(tab)
        }
    }
    cat("Cut-off value for ", desc, ": <", cutoff, " to identify ", targetSize, ", getting ", targetIS, " cases (offset ", offset, ")\n", sep="")

    # Do not identify a case, if its indicator == NA
    if.indicator = indicator    
    if.indicator[is.na(indicator)] = -Inf;
    # Sensitivity = TP / CP
    sensA = sum((if.indicator >= cutoff) & (condition == "EG"), na.rm=T) / sum(condition == "EG")
    sensC = sum((cp.indicator >= cutoff) & (cp.condition == "EG")) / sum(cp.condition == "EG")
    # Corrected for over-identification
    oversampling = targetIS / targetSize
    sensAC = sensA / oversampling
    sensCC = sensC / oversampling
    # Specifity
    specA = sum((if.indicator < cutoff) & (condition == "CG"), na.rm=T) / sum(condition == "CG")
    specC = sum((cp.indicator < cutoff) & (cp.condition == "CG")) / sum(cp.condition == "CG")
    specAC = specA / oversampling
    specCC = specC / oversampling
    # LR+
    # Likelihood ratio positive = sensitivity / (1 − specificity)
    lrP = sensAC / (1 - specAC)
    lrPC = sensC / (1 - specC)
    lrPCC = sensCC / (1 - specCC)
    
    # AuC
    # rocData = ROC(cp.indicator, (condition == "EG"), plot=NA)
    auc = AUC(cp.indicator, (cp.condition == "EG"))
    
    return(c(
        "N" = length(condition),
        "EG" = sum(condition == "EG"),
        "unknown" = sum(is.na(indicator)),
        "invalidR2" = sum(!is.na(indicator) & !completes2),
        "valid" = sum(completes),
        "iqr" = iqr,
        "missing" = sum(!completes) / length(indicator),
        "R2" = R2,
        "cut" = cutoff,
        "overspl" = oversampling,
        "sensitive.all" = sensAC,
        "sensitive.cp" = sensC,
        "sensitive.cpc" = sensCC,
        "specific.all" = specAC,
        "specific.cp" = specC,
        "specific.cpc" = specCC,
        "lr.p.all" = lrP,
        "lr.p.cp" = lrPC,
        "lr.p.cpc" = lrPCC,
        "AUC" = auc,
        "desc" = desc
    ))
}

# Compute the area under the curve
AUC = function (testres, truestat) {
  # https://stats.stackexchange.com/questions/145566/how-to-calculate-area-under-the-curve-auc-or-the-c-statistic-by-hand
  # Summary table (Table I in the paper)
  ( tab=as.matrix(table(truestat, testres)) )
  ( tot=colSums(tab) )                            # Number of patients w/ each test result
  ( truepos=unname(rev(cumsum(rev(tab[2,])))) )   # Number of true positives
  ( falsepos=unname(rev(cumsum(rev(tab[1,])))) )  # Number of false positives
  ( totpos=sum(tab[2,]) )                         # The total number of positives (one number)
  ( totneg=sum(tab[1,]) )                         # The total number of negatives (one number)
  (sens=truepos/totpos)                           # Sensitivity (fraction true positives)
  (omspec=falsepos/totneg)                        # 1 − specificity (false positives)
  sens=c(sens,0); omspec=c(omspec,0)              # Numbers when we classify all as normal
  # Plot
  if (F) {
    plot(omspec, sens, type="l", xlim=c(0,1), ylim=c(0,1), lwd=2, xlab="1 − specificity", ylab="Sensitivity") # perhaps with xaxs="i"
    grid()
    abline(0,1, col="red", lty=2)
  }
  # Manually calculating the AUC
  height = (sens[-1]+sens[-length(sens)])/2
  width = -diff(omspec) # = diff(rev(omspec))
  return(sum(height*width))
}