
Sys.setlocale("LC_CTYPE", "german")


library("readxl")
library("SnowballC")
library("tm")
library("stringi")
library("Matrix")
library("e1071")



data_sheet1<-read_excel("Kodierung_LS_Gluck_Intercoder.xls",sheet="First box")
data_sheet2<-read_excel("Kodierung_LS_Gluck_Intercoder.xls",sheet="second box")
data_sheet3<-read_excel("Kodierung_LS_Gluck_Intercoder.xls",sheet="Third box")

names(data_sheet1)<-c("response_id","response","code1","code2","expert","agree","disagree")
names(data_sheet2)<-c("response_id","response","code1","code2","expert","agree","disagree")
names(data_sheet3)<-c("response_id","response","code1","code2","expert","agree","disagree")

data<-rbind(data_sheet1,data_sheet2,data_sheet3)

index.temp<-(data$response=="-66")|(data$response=="-99")
data<-data[!index.temp,]




data$final<-data$code1
data$final[!is.na(data$code2)]<-data$code2[!is.na(data$code2)]
data$final[!is.na(data$expert)]<-data$expert[!is.na(data$expert)]

sum(is.na(data$final))

data<-data[!is.na(data$final),]

label<-sort(unique(c(data$final,data$code1,data$code2)))
lable<-sort(c(1,11,111,112,12,121,122,13,
         2,21,22,3,31,4,5,51,52,53,
         6,7,8,91,92,93,910,920,931,932,933,934,935,936,940,950))

response<-wordStem(data$response,language="german")
response<-removeWords(response,stopwords("german"))
response<-gsub(","," ",response,fixed=TRUE)
response<-gsub("."," ",response,fixed=TRUE)
response<-gsub("!"," ",response,fixed=TRUE)
response<-gsub("?"," ",response,fixed=TRUE)
response<-gsub("  "," ",response,fixed=TRUE)
response<-gsub("  "," ",response,fixed=TRUE)
response<-gsub("  "," ",response,fixed=TRUE)
response<-gsub("  "," ",response,fixed=TRUE)



find_ngrams <- function(dat, n=1, verbose=FALSE) {
  library(pbapply)
  stopifnot(is.list(dat))
  stopifnot(is.numeric(n))
  stopifnot(n>0)
  if(n == 1) return(dat)
  pblapply(dat, function(y) {
    if(length(y)<=1) return(y)
    c(y, unlist(lapply(2:n, function(n_i) {
      if(n_i > length(y)) return(NULL)
      do.call(paste, unname(as.data.frame(embed(rev(y), n_i), stringsAsFactors=FALSE)), quote=FALSE)
    })))
  })
}

text_to_ngrams <- function(sents, n=2){
  tokens <- stri_split_fixed(sents, ' ')
  tokens <- find_ngrams(tokens, n=n, verbose=TRUE)
  token_vector <- unlist(tokens)
  bagofwords <- unique(token_vector)
  n.ids <- sapply(tokens, length)
  i <- rep(seq_along(n.ids), n.ids)
  j <- match(token_vector, bagofwords)
  M <- sparseMatrix(i=i, j=j, x=1L)
  colnames(M) <- bagofwords
  return(M)
}



response.lowercase<-sapply(response,tolower)
names(response.lowercase)<-NULL
text<-text_to_ngrams(response.lowercase,n=2)
text<-text[,colnames(text)!=""]
text<-as.matrix(text)




#--------------------------
#evaluation metrics
#--------------------------



#multi-class log loss of the prediction
#The prob.pred should be a L-column matrix
logloss<-function(prob.pred,true,label) {
  L<-length(label)
  temp<-matrix(0,nrow=dim(prob.pred)[1],ncol=L)
  for (i in c(1:L)) {
    index.temp<-(colnames(prob.pred)==label[i])
    if (sum(index.temp)!=0) {
      temp[,i]<-(true==label[i])*log(prob.pred[,index.temp])
    } else {temp[,i]<-(true==label[i])*log(0.000001)}
  }
  logloss<-apply(temp,1,sum)
  return(mean((-1)*logloss))
}

#--------------------------
#No budget constraint
#--------------------------

#We are going to use 10-folds cross validation to compare the performance of different coding strategies.
#We cannot apply "majority vote" here unfortunately.
#Therefore we compare the following strategies:
#1)single coding by the first coder,
#2)single coding by the second coder,
#3)replicate,
#4)remove differences,
#and 5)expert resolves.

comparison.nobudget<-function(data.train,data.test,text.train,text.test,svm.cost,label) {
  
  #single coding by the 1st coder
  y.train.sc1<-as.factor(data.train$code1)
  data.train.sc1<-data.frame(y0=y.train.sc1,text.train)
  #data.train.sc1<-data.train.sc1[!is.na(data.train.sc1$y),]
  #presence.word.sc1<-apply(data.train.sc1[,-1],2,sum)

  
  #single coding by the 2nd coder
  y.train.sc2<-as.factor(data.train$code2)
  data.train.sc2<-data.frame(y0=y.train.sc2,text.train)
  
  #replicate
  y.train.rep<-as.factor(c(data.train$code1,data.train$code2))
  data.train.rep<-data.frame(y0=y.train.rep,rbind(text.train,text.train))
  
  #remove differences
  y.train.remove<-as.factor(data.train$final[data.train$agree==1])
  data.train.remove<-data.frame(y0=y.train.remove,text.train[data.train$agree==1,])
  
  #expert resolves
  y.train.expert<-as.factor(data.train$final)
  data.train.expert<-data.frame(y0=y.train.expert,text.train)
  
  #model fitting
  svm.sc1<-svm(y0~.,data=data.train.sc1,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost)
  predict.test.sc1<-predict(svm.sc1,newdata=data.frame(text.test),probability=TRUE)
  predict.test.sc1<-attr(predict.test.sc1, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.sc1)))
  predict.test.sc1<-predict.test.sc1[,order.label]
  response.test.sc1<-predict(svm.sc1,newdata=data.frame(text.test))
  
  svm.sc2<-svm(y0~.,data=data.train.sc2,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost)
  predict.test.sc2<-predict(svm.sc2,newdata=data.frame(text.test),probability=TRUE)
  predict.test.sc2<-attr(predict.test.sc2, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.sc2)))
  predict.test.sc2<-predict.test.sc2[,order.label]
  response.test.sc2<-predict(svm.sc2,newdata=data.frame(text.test))  
  
  svm.rep<-svm(y0~.,data=data.train.rep,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost)
  predict.test.rep<-predict(svm.rep,newdata=data.frame(text.test),probability=TRUE)
  predict.test.rep<-attr(predict.test.rep, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.rep)))
  predict.test.rep<-predict.test.rep[,order.label]
  response.test.rep<-predict(svm.rep,newdata=data.frame(text.test))  
  
  svm.remove<-svm(y0~.,data=data.train.remove,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost)
  predict.test.remove<-predict(svm.remove,newdata=data.frame(text.test),probability=TRUE)
  predict.test.remove<-attr(predict.test.remove, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.remove)))
  predict.test.remove<-predict.test.remove[,order.label]
  response.test.remove<-predict(svm.remove,newdata=data.frame(text.test))  
  
  svm.expert<-svm(y0~.,data=data.train.expert,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost)
  predict.test.expert<-predict(svm.expert,newdata=data.frame(text.test),probability=TRUE)
  predict.test.expert<-attr(predict.test.expert, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.expert)))
  predict.test.expert<-predict.test.expert[,order.label]
  response.test.expert<-predict(svm.expert,newdata=data.frame(text.test))  
  
  accuracy.sc1<-mean(response.test.sc1==data.test$final)
  logloss.sc1<-logloss(predict.test.sc1,data.test$final,label)
  
  accuracy.sc2<-mean(response.test.sc2==data.test$final)
  logloss.sc2<-logloss(predict.test.sc2,data.test$final,label)
  
  accuracy.rep<-mean(response.test.rep==data.test$final)
  logloss.rep<-logloss(predict.test.rep,data.test$final,label)
  
  accuracy.remove<-mean(response.test.remove==data.test$final)
  logloss.remove<-logloss(predict.test.remove,data.test$final,label)
  
  accuracy.expert<-mean(response.test.expert==data.test$final)
  logloss.expert<-logloss(predict.test.expert,data.test$final,label)
  
  return(c(accuracy.sc1,accuracy.sc2,accuracy.rep,accuracy.remove,accuracy.expert,
           logloss.sc1,logloss.sc2,logloss.rep,logloss.remove,logloss.expert))
  
}


#10-folds cross-validation

cv.test.nobudget<-function(data,text,svm.cost,label,fold=10,random.seed) {
  n<-dim(data)[1]
  n.test<-round(n/fold)
  
  set.seed(random.seed)
  
  #n-fold cross-validation requires random partition
  index.temp<-sample(c(1:n),size=n,replace=FALSE)
  data<-data[index.temp,]
  text<-text[index.temp,]
  
  accuracy.sc1.by.fold<-rep(0,time=fold)
  logloss.sc1.by.fold<-rep(0,time=fold)
  
  accuracy.sc2.by.fold<-rep(0,time=fold)
  logloss.sc2.by.fold<-rep(0,time=fold)
  
  accuracy.rep.by.fold<-rep(0,time=fold)
  logloss.rep.by.fold<-rep(0,time=fold)
  
  accuracy.remove.by.fold<-rep(0,time=fold)
  logloss.remove.by.fold<-rep(0,time=fold)
  
  accuracy.expert.by.fold<-rep(0,time=fold)
  logloss.expert.by.fold<-rep(0,time=fold)
  
  for (k in c(1:fold)) {
    print(k)
    index.test<-c(((k-1)*n.test+1):(k*n.test))
    
    if (k==fold) {index.test<-c(((k-1)*n.test+1):n)}
    
    
    data.train<-data[-index.test,]
    data.test<-data[index.test,]
    
    text.train<-text[-index.test,]
    text.test<-text[index.test,]
    
    result.onefold<-comparison.nobudget(data.train,data.test,text.train,text.test,svm.cost,label)
    
    accuracy.sc1.by.fold[k]<-result.onefold[1]
    accuracy.sc2.by.fold[k]<-result.onefold[2]
    accuracy.rep.by.fold[k]<-result.onefold[3]
    accuracy.remove.by.fold[k]<-result.onefold[4]
    accuracy.expert.by.fold[k]<-result.onefold[5]
    
    logloss.sc1.by.fold[k]<-result.onefold[6]
    logloss.sc2.by.fold[k]<-result.onefold[7]
    logloss.rep.by.fold[k]<-result.onefold[8]
    logloss.remove.by.fold[k]<-result.onefold[9]
    logloss.expert.by.fold[k]<-result.onefold[10]
  }
    
  
    return(c(mean(accuracy.sc1.by.fold),
             mean(accuracy.sc2.by.fold),
             mean(accuracy.rep.by.fold),
             mean(accuracy.remove.by.fold),
             mean(accuracy.expert.by.fold),
             mean(logloss.sc1.by.fold),
             mean(logloss.sc2.by.fold),
             mean(logloss.rep.by.fold),
             mean(logloss.remove.by.fold),
             mean(logloss.expert.by.fold)
             ))
  
}


cv.tuning.nobudget<-function(data,text,cost.range,label,fold=10,random.seed) {
  accuracy.sc1.by.fold<-rep(0,length=length(cost.range))
  accuracy.sc2.by.fold<-rep(0,length=length(cost.range))
  accuracy.rep.by.fold<-rep(0,length=length(cost.range))
  accuracy.remove.by.fold<-rep(0,length=length(cost.range))
  accuracy.expert.by.fold<-rep(0,length=length(cost.range))
  
  logloss.sc1.by.fold<-rep(0,length=length(cost.range))
  logloss.sc2.by.fold<-rep(0,length=length(cost.range))
  logloss.rep.by.fold<-rep(0,length=length(cost.range))
  logloss.remove.by.fold<-rep(0,length=length(cost.range))
  logloss.expert.by.fold<-rep(0,length=length(cost.range))
  
  for (i in c(1:length(cost.range))) {
    
    print(i)
    
    result<-cv.test.nobudget(data,text,svm.cost=cost.range[i],label,fold=10,random.seed) 
      
    accuracy.sc1.by.fold[i]<-result[1]
    accuracy.sc2.by.fold[i]<-result[2]
    accuracy.rep.by.fold[i]<-result[3]
    accuracy.remove.by.fold[i]<-result[4]
    accuracy.expert.by.fold[i]<-result[5]
    
    logloss.sc1.by.fold[i]<-result[6]
    logloss.sc2.by.fold[i]<-result[7]
    logloss.rep.by.fold[i]<-result[8]
    logloss.remove.by.fold[i]<-result[9]
    logloss.expert.by.fold[i]<-result[10]
    
  }
  
  return(data.frame(accuracy.sc1.by.fold,accuracy.sc2.by.fold,
                    accuracy.rep.by.fold,accuracy.remove.by.fold,
                    accuracy.expert.by.fold,
                    logloss.sc1.by.fold,logloss.sc2.by.fold,
                    logloss.rep.by.fold,logloss.remove.by.fold,
                    logloss.expert.by.fold
                    ))
}


set.seed(66)

cost.range<-c(1,10,100,500,1000,2000,3000,5000,7000,10000,12000,15000,20000)
output.cv.nobudget<-cv.tuning.nobudget(data,text,cost.range,label,fold=10,random.seed=66) 
  
cv.cost<-cost.range[apply(output.cv.nobudget[,1:5],2,which.max)]

#cv.cost<-cost.range[apply(output.cv.nobudget[,6:10],2,which.min)]

svm.cost.sc1<-cv.cost[1]
svm.cost.sc2<-cv.cost[2]
svm.cost.rep<-cv.cost[3]
svm.cost.remove<-cv.cost[4]
svm.cost.expert<-cv.cost[5]



#10-folds cross-validation with different svm.cost


comparison.nobudget.difcost<-function(data.train,data.test,text.train,text.test,label,
                                      svm.cost.sc1,svm.cost.sc2,svm.cost.rep,
                                      svm.cost.remove,svm.cost.expert) {
  
  #single coding by the 1st coder
  y.train.sc1<-as.factor(data.train$code1)
  data.train.sc1<-data.frame(y0=y.train.sc1,text.train)
  
  #single coding by the 2nd coder
  y.train.sc2<-as.factor(data.train$code2)
  data.train.sc2<-data.frame(y0=y.train.sc2,text.train)
  
  #replicate
  y.train.rep<-as.factor(c(data.train$code1,data.train$code2))
  data.train.rep<-data.frame(y0=y.train.rep,rbind(text.train,text.train))
  
  #remove differences
  y.train.remove<-as.factor(data.train$final[data.train$agree==1])
  data.train.remove<-data.frame(y0=y.train.remove,text.train[data.train$agree==1,])
  
  #expert resolves
  y.train.expert<-as.factor(data.train$final)
  data.train.expert<-data.frame(y0=y.train.expert,text.train)
  
  #model fitting
  svm.sc1<-svm(y0~.,data=data.train.sc1,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost.sc1)
  predict.test.sc1<-predict(svm.sc1,newdata=data.frame(text.test),probability=TRUE)
  predict.test.sc1<-attr(predict.test.sc1, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.sc1)))
  predict.test.sc1<-predict.test.sc1[,order.label]
  response.test.sc1<-predict(svm.sc1,newdata=data.frame(text.test))
  
  svm.sc2<-svm(y0~.,data=data.train.sc2,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost.sc2)
  predict.test.sc2<-predict(svm.sc2,newdata=data.frame(text.test),probability=TRUE)
  predict.test.sc2<-attr(predict.test.sc2, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.sc2)))
  predict.test.sc2<-predict.test.sc2[,order.label]
  response.test.sc2<-predict(svm.sc2,newdata=data.frame(text.test))  
  
  svm.rep<-svm(y0~.,data=data.train.rep,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost.rep)
  predict.test.rep<-predict(svm.rep,newdata=data.frame(text.test),probability=TRUE)
  predict.test.rep<-attr(predict.test.rep, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.rep)))
  predict.test.rep<-predict.test.rep[,order.label]
  response.test.rep<-predict(svm.rep,newdata=data.frame(text.test))  
  
  svm.remove<-svm(y0~.,data=data.train.remove,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost.remove)
  predict.test.remove<-predict(svm.remove,newdata=data.frame(text.test),probability=TRUE)
  predict.test.remove<-attr(predict.test.remove, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.remove)))
  predict.test.remove<-predict.test.remove[,order.label]
  response.test.remove<-predict(svm.remove,newdata=data.frame(text.test))  
  
  svm.expert<-svm(y0~.,data=data.train.expert,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost.expert)
  predict.test.expert<-predict(svm.expert,newdata=data.frame(text.test),probability=TRUE)
  predict.test.expert<-attr(predict.test.expert, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.expert)))
  predict.test.expert<-predict.test.expert[,order.label]
  response.test.expert<-predict(svm.expert,newdata=data.frame(text.test))  
  
  accuracy.sc1<-mean(response.test.sc1==data.test$final)
  logloss.sc1<-logloss(predict.test.sc1,data.test$final,label)
  
  accuracy.sc2<-mean(response.test.sc2==data.test$final)
  logloss.sc2<-logloss(predict.test.sc2,data.test$final,label)
  
  accuracy.rep<-mean(response.test.rep==data.test$final)
  logloss.rep<-logloss(predict.test.rep,data.test$final,label)
  
  accuracy.remove<-mean(response.test.remove==data.test$final)
  logloss.remove<-logloss(predict.test.remove,data.test$final,label)
  
  accuracy.expert<-mean(response.test.expert==data.test$final)
  logloss.expert<-logloss(predict.test.expert,data.test$final,label)
  
  return(c(accuracy.sc1,accuracy.sc2,accuracy.rep,accuracy.remove,accuracy.expert,
           logloss.sc1,logloss.sc2,logloss.rep,logloss.remove,logloss.expert))
  
}



cv.test.nobudget.difcost<-function(data,text,label,fold=10,
                                   svm.cost.sc1,svm.cost.sc2,svm.cost.rep,
                                   svm.cost.remove,svm.cost.expert) {
  n<-dim(data)[1]
  n.test<-round(n/fold)
  

  #n-fold cross-validation requires random partition
  index.temp<-sample(c(1:n),size=n,replace=FALSE)
  data<-data[index.temp,]
  text<-text[index.temp,]
  
  accuracy.sc1.by.fold<-rep(0,time=fold)
  logloss.sc1.by.fold<-rep(0,time=fold)
  
  accuracy.sc2.by.fold<-rep(0,time=fold)
  logloss.sc2.by.fold<-rep(0,time=fold)
  
  accuracy.rep.by.fold<-rep(0,time=fold)
  logloss.rep.by.fold<-rep(0,time=fold)
  
  accuracy.remove.by.fold<-rep(0,time=fold)
  logloss.remove.by.fold<-rep(0,time=fold)
  
  accuracy.expert.by.fold<-rep(0,time=fold)
  logloss.expert.by.fold<-rep(0,time=fold)
  
  for (k in c(1:fold)) {
    print(k)
    index.test<-c(((k-1)*n.test+1):(k*n.test))
    
    if (k==fold) {index.test<-c(((k-1)*n.test+1):n)}
    
    
    data.train<-data[-index.test,]
    data.test<-data[index.test,]
    
    text.train<-text[-index.test,]
    text.test<-text[index.test,]
    
    result.onefold<-comparison.nobudget.difcost(data.train,data.test,text.train,text.test,label,
                                        svm.cost.sc1,svm.cost.sc2,svm.cost.rep,
                                        svm.cost.remove,svm.cost.expert)
    
    accuracy.sc1.by.fold[k]<-result.onefold[1]
    accuracy.sc2.by.fold[k]<-result.onefold[2]
    accuracy.rep.by.fold[k]<-result.onefold[3]
    accuracy.remove.by.fold[k]<-result.onefold[4]
    accuracy.expert.by.fold[k]<-result.onefold[5]
    
    logloss.sc1.by.fold[k]<-result.onefold[6]
    logloss.sc2.by.fold[k]<-result.onefold[7]
    logloss.rep.by.fold[k]<-result.onefold[8]
    logloss.remove.by.fold[k]<-result.onefold[9]
    logloss.expert.by.fold[k]<-result.onefold[10]
  }
  
  
  return(c(mean(accuracy.sc1.by.fold),
           mean(accuracy.sc2.by.fold),
           mean(accuracy.rep.by.fold),
           mean(accuracy.remove.by.fold),
           mean(accuracy.expert.by.fold),
           mean(logloss.sc1.by.fold),
           mean(logloss.sc2.by.fold),
           mean(logloss.rep.by.fold),
           mean(logloss.remove.by.fold),
           mean(logloss.expert.by.fold)
  ))
  
}



n.repeat<-100

output.nobudget.difcost<-matrix(0,nrow=n.repeat,ncol=10)

set.seed(66)
for (l in c(1:n.repeat)) {
  print(l)
  output.nobudget.difcost[l,]<-cv.test.nobudget.difcost(data,text,label,fold=10,
                           svm.cost.sc1,svm.cost.sc2,svm.cost.rep,
                           svm.cost.remove,svm.cost.expert)    
}



boxplot(output.nobudget.difcost[,1:5])
boxplot(output.nobudget.difcost[,6:10])


accuracy.nobudget.difcost<-output.nobudget.difcost[,1:5]
colnames(accuracy.nobudget.difcost)<-c("Single-code 1","Single-code 2","Replicate","Remove differences","Expert resolves")

logloss.nobudget.difcost<-output.nobudget.difcost[,6:10]
colnames(logloss.nobudget.difcost)<-c("Single-code 1","Single-code 2","Replicate","Remove differences","Expert resolves")


boxplot(accuracy.nobudget.difcost,main="Accuracy without considering budget")
boxplot(logloss.nobudget.difcost,main="Logloss without considering budget")


#Some statistical test to see whether the differences in accuracy is significant
#We borrow the idea of bootstrap

n.bootstrap<-10000


set.seed(255)

SC1.boot<-accuracy.nobudget.difcost[sample(c(1:n.repeat),size=n.bootstrap,replace=TRUE),1]
SC2.boot<-accuracy.nobudget.difcost[sample(c(1:n.repeat),size=n.bootstrap,replace=TRUE),2]
Rep.boot<-accuracy.nobudget.difcost[sample(c(1:n.repeat),size=n.bootstrap,replace=TRUE),3]
Remove.boot<-accuracy.nobudget.difcost[sample(c(1:n.repeat),size=n.bootstrap,replace=TRUE),4]
Expert.boot<-accuracy.nobudget.difcost[sample(c(1:n.repeat),size=n.bootstrap,replace=TRUE),5]

SC1overRep<-SC1.boot>=Rep.boot
SC2overRep<-SC2.boot>=Rep.boot
SC1overRemove<-SC1.boot>=Remove.boot
SC2overRemove<-SC2.boot>=Remove.boot
SC1overExpert<-SC1.boot>=Expert.boot
SC2overExpert<-SC2.boot>=Expert.boot

RepoverRemove<-Rep.boot>=Remove.boot
RepoverExpert<-Rep.boot>=Expert.boot
RemoveoverExpert<-Remove.boot>=Expert.boot


mean(SC1overRep)
#0.0511
mean(SC2overRep)
#0.0089
mean(SC1overRemove)
#0.4428
mean(SC2overRemove)
#0.1901
mean(SC1overExpert)
#0.0432
mean(SC2overExpert)
#0.0069

mean(RepoverRemove)
#0.9148
mean(RepoverExpert)
#0.476
mean(RemoveoverExpert)
#0.0732


#--------------------------
#With budget constraint
#--------------------------


#We are going to use 10-folds cross validation to compare the performance of different coding strategies.
#We cannot apply "majority vote" here unfortunately.
#Therefore we compare the following strategies:
#1)single coding by the first coder,
#2)replicate,
#3)remove differences,
#and 4)expert resolves.



#Suppose we use 100 (out of 1479) observations in double-coded subset to estimate the error rate.
n.temp<-100
set.seed(88)
index.temp<-sample(c(1:length(data$final)),size=n.temp,replace=FALSE)
error.rate<-c(data$code1[index.temp],data$code2[index.temp])!=c(data$final[index.temp],data$final[index.temp])
error.rate<-mean(error.rate[!is.na(error.rate)])


n.sc<-1200
t<-10
n.rep<-round(n.sc/2)
n.remove<-round(n.sc/2)
n.expert<-round(n.sc/(2+2*t*error.rate-(length(label)/(length(label)-1))*t*(error.rate^2)))


comparison.budget<-function(data.train,data.test,text.train,text.test,
                            n.sc,n.rep,
                            n.remove,n.expert,svm.cost,label) {
  
  
  index.sc1<-sample(c(1:(dim(data.train)[1])),size=n.sc,replace=FALSE)
  y.train.sc1<-as.factor(data.train$code1[index.sc1])
  data.train.sc1<-data.frame(y0=y.train.sc1,text.train[index.sc1,])
  
  index.sc2<-sample(c(1:(dim(data.train)[1])),size=n.sc,replace=FALSE)
  y.train.sc2<-as.factor(data.train$code2[index.sc2])
  data.train.sc2<-data.frame(y0=y.train.sc2,text.train[index.sc2,])
  
  #replicate
  index.rep<-sample(c(1:(dim(data.train)[1])),size=n.rep,replace=FALSE)
  y.train.rep<-as.factor(c(data.train$code1[index.rep],data.train$code2[index.rep]))
  data.train.rep<-data.frame(y0=y.train.rep,rbind(text.train[index.rep,],text.train[index.rep,]))
  
  #remove differences
  index.remove<-sample(c(1:(dim(data.train)[1])),size=n.remove,replace=FALSE)
  y.train.remove<-data.train$final[index.remove]
  y.train.remove<-as.factor(y.train.remove[data.train$agree[index.remove]==1])
  data.train.remove<-data.frame(y0=y.train.remove,(text.train[index.remove,])[data.train$agree[index.remove]==1,])
  
  #expert resolves
  index.expert<-sample(c(1:(dim(data.train)[1])),size=n.expert,replace=FALSE)
  y.train.expert<-as.factor(data.train$final[index.expert])
  data.train.expert<-data.frame(y0=y.train.expert,text.train[index.expert,])
  
  #model fitting
  svm.sc1<-svm(y0~.,data=data.train.sc1,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost)
  predict.test.sc1<-predict(svm.sc1,newdata=data.frame(text.test),probability=TRUE)
  predict.test.sc1<-attr(predict.test.sc1, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.sc1)))
  predict.test.sc1<-predict.test.sc1[,order.label]
  response.test.sc1<-predict(svm.sc1,newdata=data.frame(text.test))
  
  svm.sc2<-svm(y0~.,data=data.train.sc2,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost)
  predict.test.sc2<-predict(svm.sc2,newdata=data.frame(text.test),probability=TRUE)
  predict.test.sc2<-attr(predict.test.sc2, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.sc2)))
  predict.test.sc2<-predict.test.sc2[,order.label]
  response.test.sc2<-predict(svm.sc2,newdata=data.frame(text.test))
  
  svm.rep<-svm(y0~.,data=data.train.rep,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost)
  predict.test.rep<-predict(svm.rep,newdata=data.frame(text.test),probability=TRUE)
  predict.test.rep<-attr(predict.test.rep, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.rep)))
  predict.test.rep<-predict.test.rep[,order.label]
  response.test.rep<-predict(svm.rep,newdata=data.frame(text.test))  
  
  svm.remove<-svm(y0~.,data=data.train.remove,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost)
  predict.test.remove<-predict(svm.remove,newdata=data.frame(text.test),probability=TRUE)
  predict.test.remove<-attr(predict.test.remove, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.remove)))
  predict.test.remove<-predict.test.remove[,order.label]
  response.test.remove<-predict(svm.remove,newdata=data.frame(text.test))  
  
  svm.expert<-svm(y0~.,data=data.train.expert,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost)
  predict.test.expert<-predict(svm.expert,newdata=data.frame(text.test),probability=TRUE)
  predict.test.expert<-attr(predict.test.expert, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.expert)))
  predict.test.expert<-predict.test.expert[,order.label]
  response.test.expert<-predict(svm.expert,newdata=data.frame(text.test))  
  
  accuracy.sc1<-mean(response.test.sc1==data.test$final)
  logloss.sc1<-logloss(predict.test.sc1,data.test$final,label)
  
  accuracy.sc2<-mean(response.test.sc2==data.test$final)
  logloss.sc2<-logloss(predict.test.sc2,data.test$final,label)
  
  accuracy.rep<-mean(response.test.rep==data.test$final)
  logloss.rep<-logloss(predict.test.rep,data.test$final,label)
  
  accuracy.remove<-mean(response.test.remove==data.test$final)
  logloss.remove<-logloss(predict.test.remove,data.test$final,label)
  
  accuracy.expert<-mean(response.test.expert==data.test$final)
  logloss.expert<-logloss(predict.test.expert,data.test$final,label)
  
  return(c(accuracy.sc1,accuracy.sc2,accuracy.rep,accuracy.remove,accuracy.expert,
           logloss.sc1,logloss.sc2,logloss.rep,logloss.remove,logloss.expert))
  
}



#10-folds cross-validation for tuning parameter

cv.test.budget<-function(data,text,
                         n.sc,n.rep,n.remove,n.expert,
                         svm.cost,label,fold=10,random.seed) {
  n<-dim(data)[1]
  n.test<-round(n/fold)
  
  set.seed(random.seed)
  
  #n-fold cross-validation requires random partition
  index.temp<-sample(c(1:n),size=n,replace=FALSE)
  data<-data[index.temp,]
  text<-text[index.temp,]
  
  accuracy.sc1.by.fold<-rep(0,time=fold)
  logloss.sc1.by.fold<-rep(0,time=fold)
  
  accuracy.sc2.by.fold<-rep(0,time=fold)
  logloss.sc2.by.fold<-rep(0,time=fold)
  
  accuracy.rep.by.fold<-rep(0,time=fold)
  logloss.rep.by.fold<-rep(0,time=fold)
  
  accuracy.remove.by.fold<-rep(0,time=fold)
  logloss.remove.by.fold<-rep(0,time=fold)
  
  accuracy.expert.by.fold<-rep(0,time=fold)
  logloss.expert.by.fold<-rep(0,time=fold)
  
  for (k in c(1:fold)) {
    print(k)
    index.test<-c(((k-1)*n.test+1):(k*n.test))
    
    if (k==fold) {index.test<-c(((k-1)*n.test+1):n)}
    
    
    data.train<-data[-index.test,]
    data.test<-data[index.test,]
    
    text.train<-text[-index.test,]
    text.test<-text[index.test,]
    
    result.onefold<-comparison.budget(data.train,data.test,text.train,text.test,
                                      n.sc,n.rep,
                                      n.remove,n.expert,svm.cost,label)
    
    accuracy.sc1.by.fold[k]<-result.onefold[1]
    accuracy.sc2.by.fold[k]<-result.onefold[2]
    accuracy.rep.by.fold[k]<-result.onefold[3]
    accuracy.remove.by.fold[k]<-result.onefold[4]
    accuracy.expert.by.fold[k]<-result.onefold[5]
    
    logloss.sc1.by.fold[k]<-result.onefold[6]
    logloss.sc2.by.fold[k]<-result.onefold[7]
    logloss.rep.by.fold[k]<-result.onefold[8]
    logloss.remove.by.fold[k]<-result.onefold[9]
    logloss.expert.by.fold[k]<-result.onefold[10]
  }
  
  return(c(mean(accuracy.sc1.by.fold),
                    mean(accuracy.sc2.by.fold),
                    mean(accuracy.rep.by.fold),
                    mean(accuracy.remove.by.fold),
                    mean(accuracy.expert.by.fold),
                    mean(logloss.sc1.by.fold),
                    mean(logloss.sc2.by.fold),
                    mean(logloss.rep.by.fold),
                    mean(logloss.remove.by.fold),
                    mean(logloss.expert.by.fold)))
  
}



cv.tuning.budget<-function(data,text,n.sc,n.rep,n.remove,n.expert,cost.range,label,fold=10,random.seed) {
  accuracy.sc1.by.fold<-rep(0,length=length(cost.range))
  accuracy.sc2.by.fold<-rep(0,length=length(cost.range))
  accuracy.rep.by.fold<-rep(0,length=length(cost.range))
  accuracy.remove.by.fold<-rep(0,length=length(cost.range))
  accuracy.expert.by.fold<-rep(0,length=length(cost.range))
  
  logloss.sc1.by.fold<-rep(0,length=length(cost.range))
  logloss.sc2.by.fold<-rep(0,length=length(cost.range))
  logloss.rep.by.fold<-rep(0,length=length(cost.range))
  logloss.remove.by.fold<-rep(0,length=length(cost.range))
  logloss.expert.by.fold<-rep(0,length=length(cost.range))
  
  for (i in c(1:length(cost.range))) {
    
    print(i)
    
    result<-cv.test.budget(data,text,
                             n.sc,n.rep,n.remove,n.expert,
                             svm.cost=cost.range[i],label,fold=10,random.seed)
    
    accuracy.sc1.by.fold[i]<-result[1]
    accuracy.sc2.by.fold[i]<-result[2]
    accuracy.rep.by.fold[i]<-result[3]
    accuracy.remove.by.fold[i]<-result[4]
    accuracy.expert.by.fold[i]<-result[5]
    
    logloss.sc1.by.fold[i]<-result[6]
    logloss.sc2.by.fold[i]<-result[7]
    logloss.rep.by.fold[i]<-result[8]
    logloss.remove.by.fold[i]<-result[9]
    logloss.expert.by.fold[i]<-result[10]
    
  }
  
  return(data.frame(accuracy.sc1.by.fold,accuracy.sc2.by.fold,
                    accuracy.rep.by.fold,accuracy.remove.by.fold,
                    accuracy.expert.by.fold,
                    logloss.sc1.by.fold,logloss.sc2.by.fold,
                    logloss.rep.by.fold,logloss.remove.by.fold,
                    logloss.expert.by.fold
  ))
}




set.seed(66)

cost.range<-c(1,10,100,500,1000,2000,3000,5000,7000,10000,12000,15000,20000)
output.cv.budget<-cv.tuning.budget(data,text,n.sc,n.rep,n.remove,n.expert,cost.range,label,fold=10,random.seed=66) 


cv.cost<-cost.range[apply(output.cv.budget[,1:5],2,which.max)]

#cv.cost<-cost.range[apply(output.cv.nobudget[,6:10],2,which.min)]

svm.cost.sc1<-cv.cost[1]
svm.cost.sc2<-cv.cost[2]
svm.cost.rep<-cv.cost[3]
svm.cost.remove<-cv.cost[4]
svm.cost.expert<-cv.cost[5]


#Using the tuning parameter, we conduct multiple 10-folds cv.

comparison.budget.difcost<-function(data.train,data.test,text.train,text.test,
                            n.sc,n.rep,n.remove,n.expert,label,
                            svm.cost.sc1,svm.cost.sc2,svm.cost.rep,
                            svm.cost.remove,svm.cost.expert) {
  
  
  index.sc1<-sample(c(1:(dim(data.train)[1])),size=n.sc,replace=FALSE)
  y.train.sc1<-as.factor(data.train$code1[index.sc1])
  data.train.sc1<-data.frame(y0=y.train.sc1,text.train[index.sc1,])
  
  index.sc2<-sample(c(1:(dim(data.train)[1])),size=n.sc,replace=FALSE)
  y.train.sc2<-as.factor(data.train$code2[index.sc2])
  data.train.sc2<-data.frame(y0=y.train.sc2,text.train[index.sc2,])
  
  #replicate
  index.rep<-sample(c(1:(dim(data.train)[1])),size=n.rep,replace=FALSE)
  y.train.rep<-as.factor(c(data.train$code1[index.rep],data.train$code2[index.rep]))
  data.train.rep<-data.frame(y0=y.train.rep,rbind(text.train[index.rep,],text.train[index.rep,]))
  
  #remove differences
  index.remove<-sample(c(1:(dim(data.train)[1])),size=n.remove,replace=FALSE)
  y.train.remove<-data.train$final[index.remove]
  y.train.remove<-as.factor(y.train.remove[data.train$agree[index.remove]==1])
  data.train.remove<-data.frame(y0=y.train.remove,(text.train[index.remove,])[data.train$agree[index.remove]==1,])
  
  #expert resolves
  index.expert<-sample(c(1:(dim(data.train)[1])),size=n.expert,replace=FALSE)
  y.train.expert<-as.factor(data.train$final[index.expert])
  data.train.expert<-data.frame(y0=y.train.expert,text.train[index.expert,])
  
  #model fitting
  svm.sc1<-svm(y0~.,data=data.train.sc1,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost.sc1)
  predict.test.sc1<-predict(svm.sc1,newdata=data.frame(text.test),probability=TRUE)
  predict.test.sc1<-attr(predict.test.sc1, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.sc1)))
  predict.test.sc1<-predict.test.sc1[,order.label]
  response.test.sc1<-predict(svm.sc1,newdata=data.frame(text.test))
  
  svm.sc2<-svm(y0~.,data=data.train.sc2,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost.sc2)
  predict.test.sc2<-predict(svm.sc2,newdata=data.frame(text.test),probability=TRUE)
  predict.test.sc2<-attr(predict.test.sc2, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.sc2)))
  predict.test.sc2<-predict.test.sc2[,order.label]
  response.test.sc2<-predict(svm.sc2,newdata=data.frame(text.test))
  
  svm.rep<-svm(y0~.,data=data.train.rep,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost.rep)
  predict.test.rep<-predict(svm.rep,newdata=data.frame(text.test),probability=TRUE)
  predict.test.rep<-attr(predict.test.rep, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.rep)))
  predict.test.rep<-predict.test.rep[,order.label]
  response.test.rep<-predict(svm.rep,newdata=data.frame(text.test))  
  
  svm.remove<-svm(y0~.,data=data.train.remove,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost.remove)
  predict.test.remove<-predict(svm.remove,newdata=data.frame(text.test),probability=TRUE)
  predict.test.remove<-attr(predict.test.remove, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.remove)))
  predict.test.remove<-predict.test.remove[,order.label]
  response.test.remove<-predict(svm.remove,newdata=data.frame(text.test))  
  
  svm.expert<-svm(y0~.,data=data.train.expert,scale=FALSE,kernal="linear",probability=TRUE,cost=svm.cost.expert)
  predict.test.expert<-predict(svm.expert,newdata=data.frame(text.test),probability=TRUE)
  predict.test.expert<-attr(predict.test.expert, "probabilities")
  order.label<-order(as.numeric(colnames(predict.test.expert)))
  predict.test.expert<-predict.test.expert[,order.label]
  response.test.expert<-predict(svm.expert,newdata=data.frame(text.test))  
  
  accuracy.sc1<-mean(response.test.sc1==data.test$final)
  logloss.sc1<-logloss(predict.test.sc1,data.test$final,label)
  
  accuracy.sc2<-mean(response.test.sc2==data.test$final)
  logloss.sc2<-logloss(predict.test.sc2,data.test$final,label)
  
  accuracy.rep<-mean(response.test.rep==data.test$final)
  logloss.rep<-logloss(predict.test.rep,data.test$final,label)
  
  accuracy.remove<-mean(response.test.remove==data.test$final)
  logloss.remove<-logloss(predict.test.remove,data.test$final,label)
  
  accuracy.expert<-mean(response.test.expert==data.test$final)
  logloss.expert<-logloss(predict.test.expert,data.test$final,label)
  
  return(c(accuracy.sc1,accuracy.sc2,accuracy.rep,accuracy.remove,accuracy.expert,
           logloss.sc1,logloss.sc2,logloss.rep,logloss.remove,logloss.expert))
  
}



#10-folds cross-validation for tuning parameter


cv.test.budget.difcost<-function(data,text,n.sc,n.rep,n.remove,n.expert,
                         label,fold=10,
                         svm.cost.sc1,svm.cost.sc2,svm.cost.rep,
                         svm.cost.remove,svm.cost.expert) {
  n<-dim(data)[1]
  n.test<-round(n/fold)
  
  #n-fold cross-validation requires random partition
  index.temp<-sample(c(1:n),size=n,replace=FALSE)
  data<-data[index.temp,]
  text<-text[index.temp,]
  
  accuracy.sc1.by.fold<-rep(0,time=fold)
  logloss.sc1.by.fold<-rep(0,time=fold)
  
  accuracy.sc2.by.fold<-rep(0,time=fold)
  logloss.sc2.by.fold<-rep(0,time=fold)
  
  accuracy.rep.by.fold<-rep(0,time=fold)
  logloss.rep.by.fold<-rep(0,time=fold)
  
  accuracy.remove.by.fold<-rep(0,time=fold)
  logloss.remove.by.fold<-rep(0,time=fold)
  
  accuracy.expert.by.fold<-rep(0,time=fold)
  logloss.expert.by.fold<-rep(0,time=fold)
  
  for (k in c(1:fold)) {
    print(k)
    index.test<-c(((k-1)*n.test+1):(k*n.test))
    
    if (k==fold) {index.test<-c(((k-1)*n.test+1):n)}
    
    
    data.train<-data[-index.test,]
    data.test<-data[index.test,]
    
    text.train<-text[-index.test,]
    text.test<-text[index.test,]
    
    result.onefold<-comparison.budget.difcost(data.train,data.test,text.train,text.test,
                                      n.sc,n.rep,n.remove,n.expert,label,
                                      svm.cost.sc1,svm.cost.sc2,svm.cost.rep,
                                      svm.cost.remove,svm.cost.expert)
    
    accuracy.sc1.by.fold[k]<-result.onefold[1]
    accuracy.sc2.by.fold[k]<-result.onefold[2]
    accuracy.rep.by.fold[k]<-result.onefold[3]
    accuracy.remove.by.fold[k]<-result.onefold[4]
    accuracy.expert.by.fold[k]<-result.onefold[5]
    
    logloss.sc1.by.fold[k]<-result.onefold[6]
    logloss.sc2.by.fold[k]<-result.onefold[7]
    logloss.rep.by.fold[k]<-result.onefold[8]
    logloss.remove.by.fold[k]<-result.onefold[9]
    logloss.expert.by.fold[k]<-result.onefold[10]
  }
  
  return(c(mean(accuracy.sc1.by.fold),
                    mean(accuracy.sc2.by.fold),
                    mean(accuracy.rep.by.fold),
                    mean(accuracy.remove.by.fold),
                    mean(accuracy.expert.by.fold),
                    mean(logloss.sc1.by.fold),
                    mean(logloss.sc2.by.fold),
                    mean(logloss.rep.by.fold),
                    mean(logloss.remove.by.fold),
                    mean(logloss.expert.by.fold)))
  
}



n.repeat<-100

output.budget.difcost<-matrix(0,nrow=n.repeat,ncol=10)

set.seed(66)
for (l in c(1:n.repeat)) {
  print(l)
  output.budget.difcost[l,]<-cv.test.budget.difcost(data,text,n.sc,n.rep,n.remove,n.expert,
                                                              label,fold=10,
                                                              svm.cost.sc1,svm.cost.sc2,svm.cost.rep,
                                                              svm.cost.remove,svm.cost.expert)   
}



boxplot(output.budget.difcost[,1:5])
boxplot(output.budget.difcost[,6:10])



accuracy.budget.difcost<-output.budget.difcost[,1:5]
colnames(accuracy.budget.difcost)<-c("Single-code 1","Single-code 2","Replicate","Remove differences","Expert resolves")

logloss.budget.difcost<-output.budget.difcost[,6:10]
colnames(logloss.budget.difcost)<-c("Single-code 1","Single-code 2","Replicate","Remove differences","Expert resolves")


boxplot(accuracy.budget.difcost,main="Accuracy under fixed budget")
boxplot(logloss.budget.difcost,main="Logloss under fixed budget")




#Some statistical test to see whether the differences in accuracy is significant
#We borrow the idea of bootstrap

n.bootstrap<-10000

set.seed(252)

SC1.boot<-accuracy.budget.difcost[sample(c(1:n.repeat),size=n.bootstrap,replace=TRUE),1]
SC2.boot<-accuracy.budget.difcost[sample(c(1:n.repeat),size=n.bootstrap,replace=TRUE),2]
Rep.boot<-accuracy.budget.difcost[sample(c(1:n.repeat),size=n.bootstrap,replace=TRUE),3]
Remove.boot<-accuracy.budget.difcost[sample(c(1:n.repeat),size=n.bootstrap,replace=TRUE),4]
Expert.boot<-accuracy.budget.difcost[sample(c(1:n.repeat),size=n.bootstrap,replace=TRUE),5]

SC1overRep<-SC1.boot>=Rep.boot
SC2overRep<-SC2.boot>=Rep.boot
SC1overRemove<-SC1.boot>=Remove.boot
SC2overRemove<-SC2.boot>=Remove.boot
SC1overExpert<-SC1.boot>=Expert.boot
SC2overExpert<-SC2.boot>=Expert.boot

RepoverRemove<-Rep.boot>=Remove.boot
RepoverExpert<-Rep.boot>=Expert.boot
RemoveoverExpert<-Remove.boot>=Expert.boot


mean(SC1overRep)
#1
mean(SC2overRep)
#1
mean(SC1overRemove)
#1
mean(SC2overRemove)
#1
mean(SC1overExpert)
#1
mean(SC2overExpert)
#1

mean(RepoverRemove)
#0.803
mean(RepoverExpert)
#0.9858
mean(RemoveoverExpert)
#0.9413



















#------------------------------
#presenting the result
#------------------------------


accuracy.nobudget<-output.nobudget[,c(1:5)]
names(accuracy.nobudget)<-c("SC1","SC2","Rep","Remove","Expert")
logloss.nobudget<-output.nobudget[,c(6:10)]
names(logloss.nobudget)<-c("SC1","SC2","Rep","Remove","Expert")

boxplot(accuracy.nobudget,main="Accuracy without considering budget")
boxplot(logloss.nobudget,main="Logloss without considering budget")

accuracy.budget<-output.budget[,c(1:4)]
names(accuracy.budget)<-c("SC","Rep","Remove","Expert")
logloss.budget<-output.budget[,c(5:8)]
names(logloss.budget)<-c("SC","Rep","Remove","Expert")

boxplot(accuracy.budget,main="Accuracy under fixed budget")
boxplot(logloss.budget,main="Logloss under fixed budget")




