Stata codes & tips

REGULAR SINTAX TO SHIFT GRAPH LEGENDS TO BOTTOM (6 o'clock) AND WITH 1 ROW

legend(rows(1) position(6))

STRIPPLOT GENERAL SYNTAX

stripplot soldprice, over(city) box iqr vertical xla(, ang(45)) stack msize(0.4pt) msymbol(smcircle) h(0) jitter(8)

*h(#) parameter shifts points to right, without jitter it makes points look like histogram


DISPLAYING N OF OBSERVATION PER GROUP ON THE STRIPPLOT

replace where = 0.4 //create a point in y axis that will display n of observations

egen count = count(probability41enc) if n_vaccines==0, by(lineage cleavage41enc) //create var that holds n of observations to be displayed accordin to the over and by of the stripplot below

stripplot probability41 if n_vaccines==0, over(cleavage41enc) by(lineage) box iqr vertical xla(, ang(45)) stack msize(1pt) msymbol(square) h(0.5) ///

addplot(scatter where cleavage41enc, mla(count) ms(none) mlabpos(0) mlabsize(small)) //add a layer to the stripplot on the where coordinates with the count values

CREATING MACRO FOR CURRENT DATE (USEFUL FOR SAVING FILES)

local suffix: display %tdDmCY =daily("`c(current_date)'", "DMY")

di `suffix'

CREATING INDICATOR OF ADVANCE IN A LOOP

count

local obs = r(N)

local indicator = 1

foreach lev in `r(levels)' {

       whatever command

       di "`indicator' of `obs'"

       local indicator = `indicator'+1

}

AVOIDING GRAY LINES AROUND GRAPHS

graphregion(color (white) fcolor(white) ifcolor(white)) 


BOXES FOR SYMBOLS IN GRAPHS

symy(3) symx(4)


SIZES OF LEGENDS

legend(on symxsize(4) symysize(4)) 

RETRIEVING AND USING STORED RESULTS

count

return list //try running after commands other than count to see if they have stored results

local number = r(N)

CREATING BOOKMARKS ON DO FILE

**# FIRST LEVEL BOOKMARK

**## SECOND LEVEL BOOKMARK

**### THIRD LEVEL BOOKMARK, AND SO ON

*SELECTABLE LIST OF BOOKMARS SHOWS UP ON BOTTOM LEFT CORNER OF DO FILE

LOOPING OVER ALL VARIABLES

foreach v of varlist var* {  

   do something if `v'

}

LOOPING THROUGH A LIST OF VARIABLES

foreach v of varlist xxx - yyy {  

   do something if `v'

}

LOOPING OVER ALL VALUES OF A VARIABLE

levelsof id

foreach lev in `r(levels)' {

gen test`lev' = 1 if id == `lev'

}

DIRECTLY IMPORTING DATA FROM THE INTERNET - SEVERAL DIFFERENT APPROACHES THAT *MAY* WORK

gen s = ""

replace s = fileread("LINK_TO_WEBPAGE")


readhtmltable LINK_TO_WEBPAGE // notice no quotation marks


readhtmllist LINK_TO_WEBPAGE // notice no quotation marks


clear

import delimited "LINK_TO_WEBPAGE.txt" // good for txt, csv, tsv... files

DOING AN ACTION TO ALL FILES OF CERTAIN EXTENSION IN A FOLDER

TRANSFORMING ALL CSV INTO DTA

local files : dir "PATH_TO_FOLDER" files "*.csv"

foreach file in `files' {

clear

import delimited "`file'"

save "`file'.dta", replace

}

APPENDING ALL DTA

local files : dir "PATH_TO_FOLDER" files "*.dta"

foreach file in `files' {

append using "`file'"

capture gen file = ""

capture replace file = "`file'" if file == ""

}

OPENING FASTA FILES ON STATA

import delimited "PATH_TO_YOUR_FASTA_FILE\FILENAME.fasta", delimiter(whitespace) 

rename v1 fastaid //give whatever name you want

gen trim_ntseq = fastaid[_n+1]

gen char = strlen(fastaid)

drop if char > 20 //depending on how long your sequences ID actually are. Run a tab char prior to dropping and identify a good cutoff point

compress

drop char

ASSIGNING FIRST OBSERVATION AS VAR NAMES

ds // https://www.stata.com/manuals/dds.pdf for options

capture renvars `r(varlist)' , map(strtoname(@[1]))

drop in 1"

CREATE AND ERASE DIRECTORIES

mkdir "PATH\NAME_NEW_DIR"

erase "PATH"

USING STOPBOXES AND IF COMMAND (NOT IF QUALIFIER)

*CREATE POOR LENGTH WARNING - THIS IS JUST AND EXAMPLE OF A CODE

count if (char!=603 & char!=606)

local i = r(N)

if(`i'!=0){ then

capture window stopbox rusure "There are `i' sequences with inadequate length. They may or may not be typed. Do you wish to continue?"

if _rc == 1 {

exit

else {

}

}

else{

}

}

TODAY DATE

gen date = date("$S_DATE","DMY")

COMPARING MULTIPLE REGRESSION MODELS

logistic outbreak var1 var2 var3##var4 var5##var6 var7##var8 var9, base or

estimates store test1

logistic outbreak var1 var2 var3    var4 var5##var6 var7##var8 var9, base or

estimates store test2

(...)

estimates table test1 test2, eform base star stats(aic bic N r2_p)

F1 SCORE IN STATA

logistic command

estat classification, cutoff(0.5)

di 2*((r(P_1p)*r(P_p1))/(r(P_1p)+r(P_p1)))

USING FRAMES STATA

frame create xxx //xxx is the name of the new frame (default frame is called "default")

frame change xxx

frame drop xxx


sysuse auto.dta

logistic foreign price mpg headroom trunk, base or

frame create f1 cutoff f1score sens esp ppv npv accuracy

forvalues i = 0(0.01)1{

estat classification, cutoff(`i')

local f = 2*((r(P_1p)*r(P_p1))/(r(P_1p)+r(P_p1)))

frame post f1 (`i') (`f') (r(P_p1)) (r(P_n0)) (r(P_1p)) (r(P_0n)) (r(P_corr))

}

frame change f1

twoway (line f1score cutoff) (line accuracy cutoff), ylabel(0(10)100) xlabel(0(0.1)1) xmtick(0(0.01)1)

frame change default

CLASSIFICATION (CATHEGORICAL) RANDOM FOREST IN STATA

RANDOM FOREST IN STATA

SOURCE:

https://www.stata.com/meeting/canada18/slides/canada18_Zou.pdf

FOR REGRESSION (CONTINUOUS) RANDOM FOREST IN STATA, CHECK LINK ABOVE.


*DOWNLOAD THE FOLLOWING FILE ON YOUR COMPUTER:

https://drive.google.com/file/d/1HATpqf-tQ_3l0DOUdVmhRVH8_vbIydXr/view?usp=share_link


clear

import delimited "PATH TO FILE YOU JUST DOWNLOADED"


set seed 201807 


*discarding obs to make this run faster. On a real scenario, use all data

gen discard = uniform()

sort discard

gen n = _n

drop if n > 1000


*fixing marriage for dummy

tab MARRIAGE, gen(m)


*interesting setup that records errors for i iterations (in this case, number of trees)

gen u=uniform() 

sort u 

gen out_of_bag_error1 = . 

gen validation_error = . 

gen iter1 = . 

local j = 0


forvalues i = 10(5)500{

local j = `j' + 1 

rforest defaultpaymentnextmonth LIMIT_BAL SEX /// 

EDUCATION m* AGE PAY* BILL* in 1/500, /// 

type(class) iter(`i') numvars(1) 

replace iter1 = `i' in `j' 

replace out_of_bag_error1 = `e(OOB_Error)' in `j' 

predict p in 501/1000 

replace validation_error = `e(error_rate)' in `j' 

drop p 

display `i'

}


twoway (scatter out_of_bag_error1 iter1) (scatter validation_error iter1)

*500 was chosen as iter number because graph above shows that's more or less where error levels off and stabilizes


*interesting setup that records errors for i numvars (in this case, number of trees)

gen oob_error = . 

gen nvars = . 

gen val_error = . 

local j = 0 

forvalues i = 1(1)26{ 

local j = `j' + 1 

rforest defaultpaymentnextmonth LIMIT_BAL SEX /// 

EDUCATION m* AGE PAY* BILL* /// 

in 1/500, type(class) iter(500) numvars(`i') 

replace nvars = `i' in `j' 

replace oob_error = `e(OOB_Error)' in `j' //gets the OOB erros from the rforest command (run only on the train part of the data)

predict p in 501/1000

replace val_error = `e(error_rate)' in `j' //gets the error_rate from the predict command (run only on the test part of the data)

drop p 

display `i'

}


twoway (scatter out_of_bag_error1 nvars) (scatter validation_error nvars)

*12 was chosen as final numvars because graph above shows that'w the minimum validation error


*Finalized model is: 

rforest defaultpaymentnextmonth LIMIT_BAL SEX /// 

EDUCATION m* AGE PAY* BILL* /// 

in 1/500, type(class) iter(1000) numvars(12)


*Compute expected classes of variable defaultpaymentnextmonth

predict p1


*Compute expected class probabilities of variable defaultpaymentnextmonth

predict c1 c2, pr



*VARIABLE IMPORTANCE PLOT

matrix importance = e(importance)


*convert matrix into variable

svmat importance


list importance in 1/5

gen id=""

local mynames : rownames importance

local k : word count `mynames'

// If there are more variables than observations

if `k'>_N {

set obs `k'

}

forvalues i = 1(1)`k' {

local aword : word `i' of `mynames'

local alabel : variable label `aword'

if ("`alabel'"!="") qui replace id= "`alabel'" in `i'

else qui replace id= "`aword'" in `i'

}


graph hbar (mean) importance, over(id, sort(1)) ytitle(Importance)