Automatic building of a logistic model.
by Irina 21. April 2007 09:19
Description: The data is a collection of information on colleges and universities (for illustration only; it does not pretend to be real). The primary interest is in predicting graduation. Potential predictor variables are tuition, income, wealth, and grades in different subjects. The data set has 200 rows.
The process:
1.
The first section of code splits the file into modeling and validation data sets. Validation sets constructed from a 50/50 stratified sample should be adequate for the purposes of this exercise. I used a 95/5 split here only as an example, because the data set is very small.
/* Split COLLEGE into a modeling sample and its hold-out complement.

   Pass 1 counts the rows of COLLEGE into N. Pass 2 re-reads COLLEGE and
   selects each row with probability K/N, where K is the number of rows
   still to be selected and N the number of rows still to be read.

   Selected rows go to MODEL_COLLEGE (which, as before, also carries the
   bookkeeping variables N, K and PROB); every other row goes to
   VALIDATION_COLLEGE, which keeps only the columns of COLLEGE.

   K MAY BE ANY FUNCTION OF N, E.G. K=.05*N FOR A 5 PERCENT SAMPLE. */
DATA model_college validation_college(drop=N K PROB);
  /* Pass 1: count the observations. */
  DO UNTIL (eof);
    SET college end=eof;
    N+1;
  END;

  K=0.95*N;                       /* NUMBER OF ROWS TO RANDOMLY SELECT */

  /* Pass 2: this second SET statement reads COLLEGE again from the top. */
  DO WHILE (N>0);
    SET college;
    PROB=K/N;                     /* CURRENT SELECT PROBABILITY */
    IF RANUNI(123467)>PROB THEN OUTPUT validation_college;
    ELSE DO;
      OUTPUT model_college;
      K=K-1;                      /* THE OBSERVATION IS SELECTED */
    END;
    N=N-1;
  END;
  STOP;                           /* everything was handled in one iteration */
RUN;
There are many other, more "statistical" methods for building a model, such as stepwise selection, but when you have a large data set that you are not very familiar with and that contains many continuous variables, this approach can be effective.
2.
We build a very rough ("dirty") model with the continuous variables.
/* First-pass logistic regression of RATE on all continuous predictors.
   The ParameterEstimates ODS table is captured as MODEL_CONT and the
   OUTEST= data set is written to RES_CONT. */
ods output ParameterEstimates=model_cont;
proc logistic data=model_college
              outest=res_cont
              namelen=200;
  model rate = read write math science socst
               income wealth TUITION;
run;
Now we want to drop the non-significant variables (the cutoff used here is 0.1; with real data we could require 0.05).
/* Keep only the parameter rows whose p-value does not exceed 0.1. */
data model_cont1;
  set model_cont;
  if not (ProbChiSq > 0.1);
run;
3.
Now we want to categorize the continuous variables:
/* Drop the intercept row and derive, for every remaining predictor, the
   name of its rank variable (prefix r) and of its category variable
   (prefix k). */
data model_cont1;
  set model_cont1(where=(compress(variable) ne 'Intercept'));
  RANKSS = 'r' || Variable;
  groupp = 'k' || Variable;
run;
/* Load the predictor, category and rank variable names into macro
   variables, each once as a blank-separated list and once as a
   plus-separated list (the latter is picked apart with %SCAN later on).
   COUNTER receives the number of rows in MODEL_CONT1. */
proc sql noprint;
  select Variable,
         Variable,
         groupp,
         groupp,
         RANKSS,
         RANKSS,
         compress(put(count(*), best12.)) as counter
    into :field   separated by " ",
         :fieldp  separated by "+",
         :groupp  separated by " ",
         :grouppp separated by "+",
         :RANKSS  separated by " ",
         :RANKSP  separated by "+ ",
         :counter
    from model_cont1;
quit;

/* Echo the generated lists to the log. */
%put &field;
%put &fieldp;
%put &groupp;
%put &grouppp;
%put &RANKSS;
%put &RANKSP;
%put &counter;
/* Rank every retained predictor into 20 groups; the rank of each
   variable is stored in its r-prefixed counterpart. RATE is renamed to
   TARGET in the output data set. */
proc rank data=model_college
          groups=20
          out=rankim_res(rename=(rate=target));
  var   &field;
  ranks &RANKSS;
run;
/* For each retained predictor i (1..&counter):
     1. compute the mean of TARGET per rank level (data set A_i),
     2. order the rank levels by that mean,
     3. assign each rank level to a category:
          - fewer than 7 levels: every level is its own category,
            numbered by its position in the target-mean ordering;
          - otherwise: positions 1-5 -> 1, 6-10 -> 2, 11-15 -> 3,
            the rest -> 4,
     4. print the frequency-weighted target mean per category.

   Reads the global macro variables COUNTER, RANKSP and GROUPPP and the
   data set RANKIM_RES. */
%macro category_def;
  %local i k rank_var grp_var;
  %do i=1 %to &counter;
    %let k=&i;
    %let rank_var=%scan(&RANKSP,&i,+);
    %let grp_var=%scan(&grouppp,&i,+);

    proc means noprint data=rankim_res;
      var target;
      class &rank_var;
      output out=a_&k mean=;
    run;

    /* Keep only the per-level rows (_type_>0) and order them by the
       mean of the target. */
    proc sort data=a_&k(where=(_type_>0)) out=a_&k;
      by target;
    run;

    /* NOBS= supplies the number of levels without re-creating the table
       through PROC SQL, so the sort order that the _n_ tests below rely
       on is kept intact. */
    data a_&k;
      set a_&k nobs=n_levels;
      SFIRA=n_levels;
      /* NOTE(review): the original assigned the category variable to
         itself here, which left it missing. Numbering the levels by
         their position is assumed to be the intent - confirm. */
      if SFIRA<7 then &grp_var=_n_;
      else do;
        if _n_<6 then &grp_var=1;
        else if _n_<11 then &grp_var=2;
        else if _n_<16 then &grp_var=3;
        else &grp_var=4;
      end;
    run;

    proc MEANS SUMWGT N MEAN data=a_&k;
      VAR TARGET;
      CLASS &grp_var;
      WEIGHT _FREQ_;
    run;
  %end;
%mend;
%category_def;
4.
In the next step we want to map the defined groups back onto the original data.
/* For each predictor i, collapse A_i into the single-row data set AA_i
   holding two parallel '|'-delimited strings: RANK_A_i lists the rank
   levels (two digits each) and CAT_A_i lists, in the same order, the
   category assigned to each level. */
%macro returned;
  %local rank_var grp_var;
  %do i=1 %to &counter;
    %let rank_var=%scan(&RANKSP,&i,+);
    %let grp_var=%scan(&grouppp,&i,+);

    data aa_&i(keep=rank_a_&i cat_a_&i);
      length rank_a_&i cat_a_&i $ 1000;
      retain rank_a_&i cat_a_&i;
      set a_&i end=eof;

      /* Append this level and its category to the running strings. */
      rank_a_&i = compress(rank_a_&i || put(&rank_var, z2.) || '|');
      cat_a_&i  = compress(cat_a_&i  || put(&grp_var,  z2.) || '|');

      /* Only the last row, which carries the complete strings, is kept. */
      if eof;
    run;
  %end;
%mend;
%returned;
/* %FILL generates the text "aa_1, aa_2, ..., aa_<counter-1>," for use in
   a FROM clause; the last table name is appended by the caller. */
%macro fill;
  %do i=1 %to %eval(&counter-1);
    aa_&i,
  %end;
%mend;

/* Cross join of all AA_i tables into INDEXIM. */
proc sql;
  create table indexim as
    select *
      from %fill aa_&counter;
quit;
/* Attach the lookup strings from INDEXIM to every row of RANKIM_RES:
   INDEXIM is read only on the first iteration, and values read by SET
   are retained for the following rows. */
data returned;
  if _n_ eq 1 then
    set indexim;
  set rankim_res;
run;
/* Translate each row's rank value into its category.

   For predictor i the two-digit rank is located in RANK_A_i; the two
   characters at the same position in CAT_A_i are the category, which is
   stored as a number in the matching k-prefixed variable.

   A missing rank, or a rank that is not present in the lookup string,
   yields a missing category instead of an invalid SUBSTR call. */
data returned1;
  set returned;
  array vv1 &RANKSS;
  array rr1 rank_a_1-rank_a_&counter;
  array ct1 cat_a_1-cat_a_&counter;
  array catout &groupp;
  do i=1 to &counter;
    _pos=0;
    if not missing(vv1[i]) then _pos=index(rr1[i],put(vv1[i],z2.));
    /* Explicit INPUT replaces the implicit character-to-numeric
       conversion; ?? suppresses log messages for unreadable text. */
    if _pos>0 then catout[i]=input(substr(ct1[i],_pos,2),?? 2.);
    else catout[i]=.;
  end;
  drop _pos;
run;
/* Cross-tabulate every category variable against TARGET and request
   chi-square statistics. */
proc freq data=returned1;
  tables (&groupp) * target / chisq;
run;
5.
In the next step we create formats so that next time the variables can be categorized without ranking.
/* For each predictor i, build the numeric format HAHN_i_IR that maps a
   raw value of the predictor to its category.

   Steps per predictor:
     1. take the min/max of the raw variable per (rank, category) bin,
     2. keep only bins with a positive category label,
     3. order the bins by their upper bound, make each bin start where
        the previous one ended, and open the first/last bin downwards/
        upwards with very large sentinel bounds,
     4. load the result into PROC FORMAT through a CNTLIN= data set.

   Bins without a positive label are removed BEFORE the bounds are
   chained; otherwise such a bin would take the open lower bound and the
   first real bin would lose it.

   Reads the global macro variables COUNTER, FIELDP, RANKSP and GROUPPP
   and the data set RETURNED1. */
%macro create_formats;
  %local i src_var rank_var grp_var;
  %do i=1 %to &counter;
    %let src_var=%scan(&fieldP,&i,+);
    %let rank_var=%scan(&RANKsP,&i,+);
    %let grp_var=%scan(&grouppp,&i,+);

    proc means noprint nway data=returned1 missing;
      var &src_var;
      class &rank_var &grp_var;
      output out=GV_&i min=start max=end;
    run;

    data GV_&i;
      set GV_&i;
      fmtname="hahn_&i._ir";
      label=&grp_var;
      if label>0;
    run;

    proc sort data=GV_&i;
      by fmtname end;
    run;

    data GV_&i;
      set GV_&i(rename=(start=start_old));
      by fmtname end;
      /* LAG must run on every row and before END is overwritten. */
      new_end=lag(end);
      new_start=new_end;
      if first.fmtname then start=-10000000000000000;
      else if new_start ne . then start=new_start;
      else start=start_old;
      if last.fmtname then end=100000000000000000000;
    run;

    proc sql;
      create table GV_&i._5 as
        select start,
               end,
               label,
               fmtname,
               0.000000001 AS FUZZ
          from GV_&i
         where label>0;
    quit;

    proc format cntlin=GV_&i._5
                lib=work;
    run;
  %end;
%mend;
%create_formats;
6.
In the next step we build the formulas that define the groups by using the formats.
/* For the n-th retained predictor, build the assignment text
   "<k-variable>=put(<variable>,hahn_<n>_ir.);" in SOFI. */
data variables_for_models;
  set model_cont1;
  length sofi $150;
  k    = put(_n_, 2.0);
  fmt  = compress('hahn_' || k || '_ir');
  sofi = compress(groupp || '=put(' || variable || ',' || fmt || '.);');
run;

/* Concatenate all assignment statements into one macro variable and
   echo it to the log. */
proc sql noprint;
  select sofi
    into :formula_put separated by " "
    from variables_for_models;
quit;
%put &formula_put;
/* %FORMULA_PUT emits the assignment statements stored in the global
   macro variable FORMULA_PUT so they can be dropped into a DATA step.

   The stored text contains semicolons, so it is emitted as is: the
   original re-assigned it to itself with %LET, which is at best a no-op
   and risks cutting the global value off at its first semicolon. */
%macro formula_put;
  &formula_put
%mend;

/* Apply the formats to the raw predictors to obtain the categories. */
data try;
  set model_college(keep= &field) ;
  %formula_put;
run;
7.
In the next step we run the logistic model on the categorical variables.
/* Logistic regression of TARGET on the category variables, entered as
   CLASS effects with GLM parameterization. Predicted probabilities and
   linear predictors go to TOZ_LOGISTIC_2; the ParameterEstimates ODS
   table is captured as ESTMATION. */
ods output ParameterEstimates=estmation;
proc logistic data=returned1
              descend
              namelen=200;
  class &groupp / param=glm;
  model target = &groupp;
  output out=toz_LOGISTIC_2
         p=phat_new
         xbeta=xb;
run;
8.
In the next step we run the logistic model on the categorical variables again (the same call as in step 7).
/* Same PROC LOGISTIC call on the category variables; it writes the
   output data set TOZ_LOGISTIC_2 and the ODS table ESTMATION. */
ods output ParameterEstimates=estmation;
proc logistic data=returned1
              descend
              namelen=200;
  class &groupp / param=glm;
  model target = &groupp;
  output out=toz_LOGISTIC_2
         p=phat_new
         xbeta=xb;
run;
9.
In the last step we save the regression formula.
/* Turn every row of the parameter-estimate table into one term of the
   linear predictor, stored as text in NIS1:
     - first row:                          <estimate>
     - row with a numeric class level:     (<estimate>)*(<variable>=<level>)
     - any other row:                      <estimate>*(<variable>)
   Every term except the last is followed by '+', so the terms can be
   concatenated into one expression.

   The estimate is written with BEST32. instead of relying on implicit
   numeric-to-character conversion, so no precision is lost in the saved
   formula. */
data estimate_save;
  length nis1 $200 est_txt $32;
  set estmation end=eof;

  est_txt=strip(put(Estimate,best32.));

  /* assumes the first row is the intercept - TODO confirm */
  if _n_=1 then nis1=est_txt;
  /* ?? keeps the log clean when ClassVal0 is blank or not a number. */
  else if not missing(input(ClassVal0,?? 32.)) then
    nis1=compress('('||est_txt||')'||'*'||'('||Variable||'='||ClassVal0||')');
  else
    nis1=compress(est_txt||'*'||'('||Variable||')');

  if not eof then nis1=compress(nis1||'+');

  drop est_txt;
run;
/* Join all formula terms into the macro variable FORMULA_ESTIM and echo
   it to the log. */
proc sql noprint;
  select nis1
    into :formula_estim separated by " "
    from estimate_save;
quit;
%put &formula_estim;
/* %FORMULA_ESTIM emits the saved regression expression. */
%macro formula_estim;
  &formula_estim
%mend;

/* Score the model output with the saved formula: Y is the linear
   predictor and PP its inverse-logit transform. */
data try2;
  set toz_LOGISTIC_2;
  y = %formula_estim;;
  pp = exp(y)/(1+exp(y));
run;