Logistic model
 

Automatic building of a logistic model.
by Irina 21. April 2007 09:19
Description: The data is a collection of information on colleges and universities (for illustration only; it does not pretend to be real). The primary interest is in predicting graduation. Potential predictor variables are tuition, income, wealth and grades in different subjects. The data set has 200 rows.

The process:
1.
The first section of code splits the file into modeling and validation data sets. Validation sets constructed from a 50/50 stratified sample should be adequate for the purposes of this exercise. I took 95/5 only as an example and because the data set is very small.

   
/* Draw a random sample of COLLEGE into MODEL_COLLEGE using sequential
   selection sampling: each row is kept with probability
   (rows still to select) / (rows still to read).
   The helper variables N, K and PROB are written to the output as well. */
data model_college;

    /* Pass 1: read COLLEGE to its end, counting the rows in N.
       The sum statement retains N and starts it at 0. */
    do until (eof);
        set college end=eof;
        N + 1;
    end;

    /* K is the number of rows to randomly select. It may be a function
       of N, e.g. K = .05*N for a 5 percent sample. */
    K = 0.95 * N;

    /* Pass 2: a second SET statement re-reads COLLEGE from the top. */
    do until (not (N > 0));
        set college;
        PROB = K / N;                 /* current selection probability */
        if not (ranuni(123467) > PROB) then do;
            output;                   /* the observation is selected   */
            K = K - 1;
        end;
        N = N - 1;                    /* one fewer row left to examine */
    end;
run;

/* Create VALIDATION_COLLEGE from COLLEGE.
   Fix: the SELECT statement was missing its terminating semicolon, so
   "quit" was parsed as a table alias for COLLEGE and PROC SQL was left
   running until the next step boundary.
   NOTE(review): this copies every row of COLLEGE, including the rows that
   were sampled into MODEL_COLLEGE; it is not a disjoint hold-out set.
   Confirm whether that is intended. */
proc sql;
    create table validation_college as
    select *
    from college;
quit;

There are many other, more "statistical" methods for building a model, such as stepwise selection, but when you have a large data set that you are not very familiar with and that contains many continuous variables, this approach can be effective.

2.
We build a very rough model with continuous variables.

    
/* First-pass logistic regression of RATE on all continuous predictors.
   The parameter-estimate table is captured through ODS into MODEL_CONT
   and the OUTEST= data set is written to RES_CONT. */
proc logistic data=model_college namelen=200 outest=res_cont;
    model rate = read write math science socst income wealth TUITION;
    ods output ParameterEstimates=model_cont;
run;

    
Now we want to drop the non-significant variables (with real data we could require 0.05):

    
/* Keep only the parameter rows whose p-value (ProbChiSq) is not above 0.1. */
data model_cont1;
    set model_cont;
    if not (ProbChiSq > 0.1);   /* subsetting IF: rows failing this are dropped */
run;

    
3.
Now we want to categorize the continuous variables:

    
 data model_cont1;
 set model_cont1;
 where compress(variable) ne 'Intercept';
 RANKSS='r'||Variable ;
 groupp='k'||Variable ;
 run;
 
/* Load the variable-name lists from MODEL_CONT1 into macro variables.
   Each list is stored twice: blank-separated (for VAR/ARRAY-style lists)
   and "+"-separated (for picking one item with %SCAN).
   COUNTER receives the row count of MODEL_CONT1. */
proc sql noprint;
    select Variable,
           Variable,
           groupp,
           groupp,
           RANKSS,
           RANKSS,
           compress(put(count(*), best12.)) as counter
      into :field   separated by " ",
           :fieldp  separated by "+",
           :groupp  separated by " ",
           :grouppp separated by "+",
           :RANKSS  separated by " ",
           :RANKSP  separated by "+ ",
           :counter
      from model_cont1;
quit;

/* Echo the generated lists to the log for inspection. */
%put &field;
%put &fieldp;
%put &groupp;
%put &grouppp;
%put &RANKSS;
%put &RANKSP;
%put &counter;


/* Rank every selected predictor into 20 groups. The rank of each variable
   in &field is stored in the matching variable of &RANKSS, and RATE is
   renamed to TARGET in the output data set RANKIM_RES. */
proc rank data=model_college
          groups=20
          out=rankim_res(rename=(rate=target));
    var   &field;
    ranks &RANKSS;
run;

/* For each selected predictor (1..&counter):
     1. compute the mean of TARGET per rank bucket (data set a_<i>),
     2. order the buckets by that mean,
     3. assign each bucket to a coarse group (the "k" variable),
     4. print a weighted summary of TARGET per group.
   Reads:  RANKIM_RES and the macro variables &counter, &RANKSP, &grouppp.
   Writes: one data set a_<i> per predictor.

   Changes from the earlier version:
   - With fewer than 7 rank buckets the group variable was assigned to
     itself and therefore stayed missing. Each bucket now becomes its own
     group, numbered by its position in the target-ordered list.
   - The bucket count SFIRA is taken from the SET statement's NOBS= option
     rather than a PROC SQL remerge, so the BY-TARGET row order that the
     _N_-based grouping relies on cannot be disturbed.
   - Loop and helper macro variables are declared %LOCAL. */
%macro category_def;
    %local i rankvar groupvar;
    %do i = 1 %to &counter;
        /* %LET strips the blank left by the "+ " separator in &RANKSP. */
        %let rankvar  = %scan(&RANKSP,&i,+);
        %let groupvar = %scan(&grouppp,&i,+);

        /* Mean of TARGET for every rank bucket of this predictor. */
        proc means noprint data=rankim_res;
            var target;
            class &rankvar;
            output out=a_&i mean=;
        run;

        /* Drop the overall (_TYPE_=0) row and order buckets by mean target. */
        proc sort data=a_&i(where=(_type_>0));
            by target;
        run;

        data a_&i;
            set a_&i nobs=_nbuckets;
            SFIRA = _nbuckets;                     /* number of rank buckets */
            if SFIRA < 7 then &groupvar = _n_;     /* few buckets: one group each */
            else if _n_ < 6  then &groupvar = 1;
            else if _n_ < 11 then &groupvar = 2;
            else if _n_ < 16 then &groupvar = 3;
            else &groupvar = 4;
        run;

        /* Report: TARGET per group, weighted by bucket size (_FREQ_). */
        proc means sumwgt n mean data=a_&i;
            var TARGET;
            class &groupvar;
            weight _FREQ_;
        run;
    %end;
%mend;
%category_def;

    
4.
In the next step we return the defined groups to the original data.

    
/* For each predictor i, collapse data set a_<i> into a single row (aa_<i>)
   holding two "|"-delimited strings built in the same row order:
     rank_a_<i> - the rank values, each written with Z2.
     cat_a_<i>  - the group values, each written with Z2.
   The position of a rank in the first string therefore matches the
   position of its group in the second string. */
%macro returned;
    %do i = 1 %to &counter;
        data aa_&i;
            length rank_a_&i cat_a_&i $ 1000;
            retain rank_a_&i cat_a_&i;
            keep   rank_a_&i cat_a_&i;
            set a_&i end=eof;
            rank_a_&i = compress(rank_a_&i || put(%scan(&RANKSP,&i,+), z2.) || '|');
            cat_a_&i  = compress(cat_a_&i  || put(%scan(&grouppp,&i,+), z2.) || '|');
            if eof;   /* only the last row, carrying the full strings, is written */
        run;
    %end;
%mend;
%returned;

/* Emit "aa_1, aa_2, ... aa_<counter-1>," - every lookup table except the
   last one, each followed by a comma - for use in a FROM clause. */
%macro fill;
    %do i = 1 %to %eval(&counter - 1);
        aa_&i,
    %end;
%mend;


/* Cross-join the aa_<i> tables into INDEXIM. */
proc sql;
    create table indexim as
    select *
    from %fill aa_&counter;
quit;


/* Attach the INDEXIM lookup strings to every row of RANKIM_RES.
   INDEXIM is read only on the first iteration; variables read by SET are
   retained, so its values stay available for all later rows. */
data returned;
    if _n_ = 1 then do;
        set indexim;
    end;
    set rankim_res;
run;

 

/* Translate each predictor's rank into its group number.
   For predictor i, the rank (written with Z2.) is located inside the
   rank_a_<i> string, and the two characters at the same position of
   cat_a_<i> are the group.

   Changes from the earlier version:
   - When the rank is not found (e.g. the rank is missing), INDEX returns 0
     and SUBSTR was called with an invalid start position. The group is now
     set to missing in that case without raising an invalid-argument note.
   - The two-character group is converted with INPUT() rather than relying
     on implicit character-to-numeric conversion. */
data returned1;
    set returned;

    array vv1    &RANKSS;                      /* rank variables          */
    array rr1    rank_a_1-rank_a_&counter;     /* rank lookup strings     */
    array ct1    cat_a_1-cat_a_&counter;       /* group lookup strings    */
    array catout &groupp;                      /* resulting group numbers */

    do i = 1 to &counter;
        _pos = index(rr1[i], put(vv1[i], z2.));
        if _pos > 0 then catout[i] = input(substr(ct1[i], _pos, 2), 2.);
        else catout[i] = .;
    end;

    drop _pos;
run;


/* Cross-tabulate every group variable against TARGET with chi-square tests. */
proc freq data=returned1;
    tables (&groupp) * target / chisq;
run;

    
5.
In the next step we create formats in order to categorize the variables next time without ranks.

    
/* For each predictor i (1..&counter), build a numeric format named
   hahn_<i>_ir that maps ranges of the raw predictor value to its group.
   Reads RETURNED1 and the macro variables &counter, &fieldP, &RANKsP and
   &grouppp; writes work data sets GV_<i> and GV_<i>_5 and loads the format
   into the WORK library. */
%macro create_formats;
%do i=1 %to &counter;
/* Min and max of the raw predictor inside every (rank, group) cell;
   MISSING keeps cells whose class values are missing. */
proc means noprint nway data= returned1 missing;
var    %scan(&fieldP,&i,+) ;
class  %scan(&RANKsP,&i,+) %scan(&grouppp,&i,+);
output out=GV_&i  min=start max=end;
run;


/* Add the CNTLIN columns: format name and label (the group value). */
data GV_&i;
set GV_&i ;
fmtname="hahn_&i._ir";
label=%scan(&grouppp,&i,+);
run;

proc sort data=GV_&i;
by fmtname end ;
run;

/* fmtname is the same on every row, so first./last. mark the first and
   last row of the sorted table: the last range gets a very large upper
   bound and the first range a very large negative lower bound. */
data GV_&i;
set GV_&i;
by fmtname end;
if last.fmtname then end=100000000000000000000;
IF FIRST.fmtname THEN start=-10000000000000000;
run;
proc sort data= GV_&i;
by end;
run;
/* new_start carries the previous row's upper bound (missing on row 1). */
data GV_&i;
set GV_&i;
new_end=lag(end);
new_start=new_end ;
run;

/* Each range starts where the previous one ended; when there is no
   previous upper bound the original start value is kept. */
data GV_&i;
set GV_&i (rename=start=start_old);
if new_start ne . then do ;
start=new_start;
end;
else do;
start=start_old;
end;
run;
/* Keep only the columns passed to PROC FORMAT, add a constant FUZZ
   column, and drop rows whose label is not greater than 0. */
proc sql;
create table GV_&i._5 as
select start,
       end,
       label,
       fmtname ,
       0.000000001 AS FUZZ
from  GV_&i
where label >0;
quit;

/* Create the format from the control data set. */
proc format cntlin=GV_&i._5
lib=work;
run;
%end;
%mend;
%create_formats;

    
6.
In the next step we create the formulas that define the groups using the formats.

    
/* Build, for every predictor row of MODEL_CONT1, one assignment statement
   of the form  <group var>=put(<predictor>,hahn_<row number>_ir.);
   and store it in SOFI. */
data variables_for_models;
    set model_cont1;
    length sofi $150;
    k    = put(_n_, 2.0);                       /* row number as text */
    fmt  = compress('hahn_' || k || '_ir');     /* format name        */
    sofi = compress(groupp || '=put(' || variable || ',' || fmt || '.);');
run;

/* Concatenate all generated assignment statements, blank-separated,
   into the macro variable FORMULA_PUT and echo it to the log. */
proc sql noprint;
    select sofi
      into :formula_put separated by " "
      from variables_for_models;
quit;

%put &formula_put;

/* Emit the generated assignment statements held in &formula_put.
   The %LET re-assignment removes leading and trailing blanks first. */
%macro formula_put;
    %let formula_put = &formula_put;
    &formula_put ;
%mend;

/* Apply the formats to the raw predictors of MODEL_COLLEGE to derive the
   group variables directly, without going through ranks. */
data try;
    set model_college(keep=&field);
    %formula_put;
run;

    
7.
In the next step we run the logistic model on the categorical variables.

    
/* Logistic regression of TARGET on the group variables, treated as CLASS
   effects with GLM parameterization. Predicted probabilities (phat_new)
   and linear predictors (xb) go to TOZ_LOGISTIC_2; the parameter-estimate
   table is captured through ODS into ESTMATION. */
proc logistic data=returned1 namelen=200 descend;
    class  &groupp / param=glm;
    model  target = &groupp;
    output out=toz_LOGISTIC_2 p=phat_new xbeta=xb;
    ods output ParameterEstimates=estmation;
run;
    
8.
In the next step we run the logistic model on the categorical variables.

/* Same model as the previous step: TARGET on the group variables as CLASS
   effects (GLM parameterization). Writes TOZ_LOGISTIC_2 (phat_new, xb)
   and the ODS parameter-estimate table ESTMATION. */
proc logistic data=returned1 namelen=200 descend;
    class  &groupp / param=glm;
    model  target = &groupp;
    output out=toz_LOGISTIC_2 p=phat_new xbeta=xb;
    ods output ParameterEstimates=estmation;
run;
      
9.
In the last step we save the regression formula.

/* Turn every row of the parameter-estimate table into one term of the
   linear predictor, stored as text in NIS1:
     - row 1 (presumably the intercept)  ->  <estimate>+
     - rows with a numeric ClassVal0     ->  (<estimate>)*(<variable>=<level>)+
     - any other row                     ->  <estimate>*(<variable>)+
   The last row is always written in one of the two product forms and
   without the trailing "+", so the concatenated terms form a valid
   expression. abs(ClassVal0*1) >= 0 is true only when ClassVal0 converts
   to a non-missing number. */
data estimate_save;
    length nis1 $90;
    set estmation end=eof;

    if eof then do;
        /* final term: no trailing plus sign */
        if abs(ClassVal0*1) >= 0 then
            nis1 = compress('(' || Estimate || ')' || '* ' || '(' || Variable || '=' || ClassVal0 || ')');
        else
            nis1 = compress(Estimate || '* ' || '(' || Variable || ')');
    end;
    else if _n_ = 1 then
        nis1 = compress(Estimate || '+');
    else if abs(ClassVal0*1) >= 0 then
        nis1 = compress('(' || Estimate || ')' || '* ' || '(' || Variable || '=' || ClassVal0 || ')' || '+');
    else
        nis1 = compress(Estimate || '* ' || '(' || Variable || ')' || '+');
run;


/* Join all formula terms, blank-separated, into the macro variable
   FORMULA_ESTIM and echo the resulting expression to the log. */
proc sql noprint;
    select nis1
      into :formula_estim separated by " "
      from estimate_save;
quit;

%put &formula_estim;

 

/* Emit the regression expression held in &formula_estim, followed by a
   semicolon. The %LET re-assignment removes leading and trailing blanks. */
%macro formula_estim;
    %let formula_estim = &formula_estim;
    &formula_estim;
%mend;

/* Re-score TOZ_LOGISTIC_2 with the saved formula: Y is the linear
   predictor and PP its logistic transform. */
data try2;
    set toz_LOGISTIC_2;
    y  = %formula_estim;
    pp = exp(y) / (1 + exp(y));
run;