// Joe Dutch data
// text : Text variable
// category:  Category (y)
// instance :  Identifier
// nohouse_encr,  nomem_enc: Identifier variables to link to the LISS data

version 13.0
clear 

// Prerequisites: the user-written commands -boost- and -ngram- must be
// installed before this file is run, e.g.
//     net search ngram
// and then follow the links.

// Graph appearance
set scheme        s1color    
set copycolor     asis       

// Session settings.
// NOTE: the former "set memory 100m" has been removed. Stata 12 and later
// manage memory automatically and ignore that command, and this file already
// requires version 13.0, so the line had no effect.
set more off
// Fixed seed so that the random draws made later in this file are repeatable
set seed 1
set matsize 8000
discard

// Close any log left open by an earlier run, then start a fresh one
cap log close 
log using patient_joe_boost, replace
***************************************************************************
// Load the categorized data and put the observations in random order.
// The sort key is stored as a double: with the default float storage the
// draws are rounded, which makes ties more likely, and -sort- orders tied
// observations arbitrarily. Ties would make the shuffle (and therefore the
// "in 1/500" sample used for boosting further down) harder to reproduce.
// runiform() is the current name of the older uniform() function.
use joe_dutch_categorized
gen double u = runiform()
sort u
describe
****************************************************************************
// Build n-gram variables from the text variable. The boosting step further
// down uses the variables n_token and t_* as predictors.
// NOTE(review): degree(2) presumably means unigrams and bigrams, threshold(5)
// a minimum frequency, and binarize 0/1 indicators instead of counts --
// confirm against -help ngram-, together with the meaning of sub(4 1 1).
set locale_functions nl
ngram text , degree(2) threshold(5) sub(4 1 1) binarize 
****************************************************************************
// Multinomial boosting of category on the n-gram variables.
// The model is fit on observations 1-500 only; predictions are written to
// variables with the stub boost_pred (boost_pred1 ... boost_pred4 are used
// below). The profiler is switched on around the call to report run time.
// NOTE(review): train(0.8) presumably uses 80% of the 500 observations for
// training and the rest as test data -- confirm against -help boost-.
capture drop boost_pred1-boost_pred4
profiler on
boost category n_token t_*  in 1/500,  ///
	dist(multinomial)                  ///
	train(0.8)                         ///
	maxiter(3000)                      ///
	bag(.5)                            ///
	interaction(5)                     ///
	shrink(.1)                         ///
	pred("boost_pred")                 ///
	influence                          ///
	seed(1)
profiler off 
profiler report
profiler clear


// Confidence and predicted category.
// pred_max is the largest of the four predicted values; predcat is the class
// (1-4) whose prediction equals that maximum. If two classes tie, the
// higher-numbered class wins because it is assigned last.
//
// pred_max is stored as a double so that the equality tests below are exact
// whatever storage type boost_pred1-boost_pred4 have (a float copy of a
// double value would generally not compare equal to the original).
//
// Observations for which all four predictions are missing keep predcat
// missing: without the !missing() guard, "missing == missing" is true and
// such observations would silently be assigned category 4.
gen double pred_max = max(boost_pred1, boost_pred2, boost_pred3, boost_pred4)
gen predcat = .
forvalues k = 1/4 {
	replace predcat = `k' if pred_max == boost_pred`k' & !missing(pred_max)
}
// Reuse the value label named "category" so the table rows/columns line up
label values predcat category
tab category predcat, row
******************************************************************************
* Variable influence
* Copy the influence matrix left behind in e(influence) into the variables
* influence1 ... influence4 (one variable per matrix column).
matrix influence = e(influence)
svmat influence
* id numbers the rows that hold an influence value; it is missing elsewhere
gen id = _n
replace id = . if influence1 == .
//graph bar (mean) influence1, over(id) ytitle(Percentage Influence) 
* Number of predictors with a positive influence for at least one class
count if influence1 != . & (influence1 > 0.00 | influence2 > 0.00 | influence3 > 0.00 | influence4 > 0.00)


// Attach the predictor names (row names of the matrix) to the same rows
local rowlabels : rownames influence
local nrows : word count `rowlabels'
gen names = ""
forvalues r = 1/`nrows' {
  local thisname : word `r' of `rowlabels'
  quietly replace names = `"`thisname'"' in `r'
}
* Largest influence across the four classes; show predictors above 5
gen influence_max = max(influence1, influence2, influence3, influence4)
list names influence* if !missing(influence1) & influence_max > 5

******************************************************************************
* accuracy
* Drop any earlier definition of the -accuracy- program (ignoring the error
* if there is none), then run the do-file one directory up.
* NOTE(review): ../expected_accuracy.do presumably (re)defines -accuracy-;
* that file is not visible from here -- confirm.
cap program drop accuracy
do ../expected_accuracy.do
* Two copies of category.
* NOTE(review): temp1 and temp2 are not referenced again in this file; they
* may be needed by -accuracy- or be leftovers -- verify before removing.
gen temp1=category
gen temp2=category

* Evaluate on observations 501-1758, i.e. outside the 1/500 range used in the
* boost call. The upper bound 1758 is hard-coded.
* NOTE(review): 1758 is presumably the number of observations in the data, and
* the meaning of p_man(.8) is defined by the accuracy program -- confirm both.
accuracy pred_max category predcat in 501/1758, p_man(.8) 

* Store the data set including predictions and influence variables
save joe_dutch_merged_boost, replace

*******************************************************************************
* Write the predictors with non-zero influence to the file influence_names,
* ordered from most to least influential. The data in memory are restored
* afterwards, so the drop and the re-sort are temporary.
preserve
drop if influence_max == 0
gsort -influence_max
#delimit ;
outsheet names influence_max influence1 influence2 influence3 influence4
	using influence_names
	if influence_max > 0 & !missing(influence_max),
	nonames noquote replace ;
#delimit cr
restore
*******************************************************************************
capture log close 
