Stata codes & tips
REGULAR SINTAX TO SHIFT GRAPH LEGENDS TO BOTTOM (6 o'clock) AND WITH 1 ROW
legend(rows(1) position(6))
STRIPPLOT GENERAL SYNTAX
stripplot soldprice, over(city) box iqr vertical xla(, ang(45)) stack msize(0.4pt) msymbol(smcircle) h(0) jitter(8)
*h(#) parameter shifts points to right, without jitter it makes points look like histogram
DISPLAYING N OF OBSERVATION PER GROUP ON THE STRIPPLOT
replace where = 0.4 //create a point in y axis that will display n of observations
egen count = count(probability41enc) if n_vaccines==0, by(lineage cleavage41enc) //create var that holds n of observations to be displayed accordin to the over and by of the stripplot below
stripplot probability41 if n_vaccines==0, over(cleavage41enc) by(lineage) box iqr vertical xla(, ang(45)) stack msize(1pt) msymbol(square) h(0.5) ///
addplot(scatter where cleavage41enc, mla(count) ms(none) mlabpos(0) mlabsize(small)) //add a layer to the stripplot on the where coordinates with the count values
CREATING MACRO FOR CURRENT DATE (USEFUL FOR SAVING FILES)
local suffix: display %tdDmCY =daily("`c(current_date)'", "DMY")
di `suffix'
CREATING INDICATOR OF ADVANCE IN A LOOP
count
local obs = r(N)
local indicator = 1
foreach lev in `r(levels)' {
whatever command
di "`indicator' of `obs'"
local indicator = `indicator'+1
}
AVOIDING GRAY LINES AROUND GRAPHS
graphregion(color (white) fcolor(white) ifcolor(white))
BOXES FOR SYMBOLS IN GRAPHS
symy(3) symx(4)
SIZES OF LEGENDS
legend(on symxsize(4) symysize(4))
RETRIEVING AND USING STORED RESULTS
count
return list //try running after commands other than count to see if they have stored results
local number = r(N)
CREATING BOOKMARKS ON DO FILE
**# FIRST LEVEL BOOKMARK
**## SECOND LEVEL BOOKMARK
**### THIRD LEVEL BOOKMARK, AND SO ON
*SELECTABLE LIST OF BOOKMARS SHOWS UP ON BOTTOM LEFT CORNER OF DO FILE
LOOPING OVER ALL VARIABLES
foreach v of varlist var* {
do something if `v'
}
LOOPING THROUGH A LIST OF VARIABLES
foreach v of varlist xxx - yyy {
do something if `v'
}
LOOPING OVER ALL VALUES OF A VARIABLE
levelsof id
foreach lev in `r(levels)' {
gen test`lev' = 1 if id == `lev'
}
DIRECTLY IMPORTING DATA FROM THE INTERNET - SEVERAL DIFFERENT APPROACHES THAT *MAY* WORK
gen s = ""
replace s = fileread("LINK_TO_WEBPAGE")
readhtmltable LINK_TO_WEBPAGE // notice no quotation marks
readhtmllist LINK_TO_WEBPAGE // notice no quotation marks
clear
import delimited "LINK_TO_WEBPAGE.txt" // good for txt, csv, tsv... files
DOING AN ACTION TO ALL FILES OF CERTAIN EXTENSION IN A FOLDER
TRANSFORMING ALL CSV INTO DTA
local files : dir "PATH_TO_FOLDER" files "*.csv"
foreach file in `files' {
clear
import delimited "`file'"
save "`file'.dta", replace
}
APPENDING ALL DTA
local files : dir "PATH_TO_FOLDER" files "*.dta"
foreach file in `files' {
append using "`file'"
capture gen file = ""
capture replace file = "`file'" if file == ""
}
OPENING FASTA FILES ON STATA
import delimited "PATH_TO_YOUR_FASTA_FILE\FILENAME.fasta", delimiter(whitespace)
rename v1 fastaid //give whatever name you want
gen trim_ntseq = fastaid[_n+1]
gen char = strlen(fastaid)
drop if char > 20 //depending on how long your sequences ID actually are. Run a tab char prior to dropping and identify a good cutoff point
compress
drop char
ASSIGNING FIRST OBSERVATION AS VAR NAMES
ds // https://www.stata.com/manuals/dds.pdf for options
capture renvars `r(varlist)' , map(strtoname(@[1]))
drop in 1"
CREATE AND ERASE DIRECTORIES
mkdir "PATH\NAME_NEW_DIR"
erase "PATH"
USING STOPBOXES AND IF COMMAND (NOT IF QUALIFIER)
*CREATE POOR LENGTH WARNING - THIS IS JUST AND EXAMPLE OF A CODE
count if (char!=603 & char!=606)
local i = r(N)
if(`i'!=0){ then
capture window stopbox rusure "There are `i' sequences with inadequate length. They may or may not be typed. Do you wish to continue?"
if _rc == 1 {
exit
else {
}
}
else{
}
}
TODAY DATE
gen date = date("$S_DATE","DMY")
COMPARING MULTIPLE REGRESSION MODELS
logistic outbreak var1 var2 var3##var4 var5##var6 var7##var8 var9, base or
estimates store test1
logistic outbreak var1 var2 var3 var4 var5##var6 var7##var8 var9, base or
estimates store test2
(...)
estimates table test1 test2, eform base star stats(aic bic N r2_p)
F1 SCORE IN STATA
logistic command
estat classification, cutoff(0.5)
di 2*((r(P_1p)*r(P_p1))/(r(P_1p)+r(P_p1)))
USING FRAMES STATA
frame create xxx //xxx is the name of the new frame (default frame is called "default")
frame change xxx
frame drop xxx
sysuse auto.dta
logistic foreign price mpg headroom trunk, base or
frame create f1 cutoff f1score sens esp ppv npv accuracy
forvalues i = 0(0.01)1{
estat classification, cutoff(`i')
local f = 2*((r(P_1p)*r(P_p1))/(r(P_1p)+r(P_p1)))
frame post f1 (`i') (`f') (r(P_p1)) (r(P_n0)) (r(P_1p)) (r(P_0n)) (r(P_corr))
}
frame change f1
twoway (line f1score cutoff) (line accuracy cutoff), ylabel(0(10)100) xlabel(0(0.1)1) xmtick(0(0.01)1)
frame change default
CLASSIFICATION (CATHEGORICAL) RANDOM FOREST IN STATA
RANDOM FOREST IN STATA
SOURCE:
https://www.stata.com/meeting/canada18/slides/canada18_Zou.pdf
FOR REGRESSION (CONTINUOUS) RANDOM FOREST IN STATA, CHECK LINK ABOVE.
*DOWNLOAD THE FOLLOWING FILE ON YOUR COMPUTER:
https://drive.google.com/file/d/1HATpqf-tQ_3l0DOUdVmhRVH8_vbIydXr/view?usp=share_link
clear
import delimited "PATH TO FILE YOU JUST DOWNLOADED"
set seed 201807
*discarding obs to make this run faster. On a real scenario, use all data
gen discard = uniform()
sort discard
gen n = _n
drop if n > 1000
*fixing marriage for dummy
tab MARRIAGE, gen(m)
*interesting setup that records errors for i iterations (in this case, number of trees)
gen u=uniform()
sort u
gen out_of_bag_error1 = .
gen validation_error = .
gen iter1 = .
local j = 0
forvalues i = 10(5)500{
local j = `j' + 1
rforest defaultpaymentnextmonth LIMIT_BAL SEX ///
EDUCATION m* AGE PAY* BILL* in 1/500, ///
type(class) iter(`i') numvars(1)
replace iter1 = `i' in `j'
replace out_of_bag_error1 = `e(OOB_Error)' in `j'
predict p in 501/1000
replace validation_error = `e(error_rate)' in `j'
drop p
display `i'
}
twoway (scatter out_of_bag_error1 iter1) (scatter validation_error iter1)
*500 was chosen as iter number because graph above shows that's more or less where error levels off and stabilizes
*interesting setup that records errors for i numvars (in this case, number of trees)
gen oob_error = .
gen nvars = .
gen val_error = .
local j = 0
forvalues i = 1(1)26{
local j = `j' + 1
rforest defaultpaymentnextmonth LIMIT_BAL SEX ///
EDUCATION m* AGE PAY* BILL* ///
in 1/500, type(class) iter(500) numvars(`i')
replace nvars = `i' in `j'
replace oob_error = `e(OOB_Error)' in `j' //gets the OOB erros from the rforest command (run only on the train part of the data)
predict p in 501/1000
replace val_error = `e(error_rate)' in `j' //gets the error_rate from the predict command (run only on the test part of the data)
drop p
display `i'
}
twoway (scatter out_of_bag_error1 nvars) (scatter validation_error nvars)
*12 was chosen as final numvars because graph above shows that'w the minimum validation error
*Finalized model is:
rforest defaultpaymentnextmonth LIMIT_BAL SEX ///
EDUCATION m* AGE PAY* BILL* ///
in 1/500, type(class) iter(1000) numvars(12)
*Compute expected classes of variable defaultpaymentnextmonth
predict p1
*Compute expected class probabilities of variable defaultpaymentnextmonth
predict c1 c2, pr
*VARIABLE IMPORTANCE PLOT
matrix importance = e(importance)
*convert matrix into variable
svmat importance
list importance in 1/5
gen id=""
local mynames : rownames importance
local k : word count `mynames'
// If there are more variables than observations
if `k'>_N {
set obs `k'
}
forvalues i = 1(1)`k' {
local aword : word `i' of `mynames'
local alabel : variable label `aword'
if ("`alabel'"!="") qui replace id= "`alabel'" in `i'
else qui replace id= "`aword'" in `i'
}
graph hbar (mean) importance, over(id, sort(1)) ytitle(Importance)