WilcxGoodnessFit.sas

/******************************************************************

* Program Name: WilGoodnessFit.sas *

* Program Dir : /udd/nhron/macrodevelopment/ *

* Program Date: Jan. 2008 *

* *

* Macro Variables: *

* *

* maindsn : data that contains id,subgrp,outcome, *

* score1 from model1, score2 from model2 *

* outcome: outcome 1: cases; 0: controls *

* subgrp : subgrp coding should be 1, 2 and etc. *

* id : id *

* *

* Note1: This program is to use the Mann-Whitney *

* U statistic approach to compare the C *

* statistic for *

* two different risk prediction models on the *

* the same dataset *

* Note2: *

* Normality of the risk score distribution *

* is not required to use this method *

* Based on paper "Power and sample size estimation *

* for the Wilcoxon Rank Sum Test" *

* by B. Rosner and R. Glynn *

* Programmed by: Rong Chen and Marion McPhee *

****************************************************************/

%macro WilGoodnessFit(maindsn=,

outcome=,

subgrp=,

id=);

%if &subgrp ne %then %do;

proc sql;

select count(distinct(&subgrp)) into: s

from &maindsn;

quit;

data _null_;

call symput('n', trim(left(&s)));

run;

%EstTheta(dsn=&maindsn, mdlnum=1);

%EstTheta(dsn=&maindsn, mdlnum=2);

data both;

merge rnkall1 rnkall2 end=eof;

by &subgrp &id;

if _n_=1 then set numcnt;

if _n_=1 then set numcas;

retain cs1-cs&n;

retain cn1-cn&n;

array ba(s) cn1-cn&n;

array aa(s) cs1-cs&n;

s=&subgrp;

ncase=aa;

ncntl=ba;

probit_a=probit(rscore_all1/(ncase+ncntl+1));

probit_b=probit(rscore_all2/(ncase+ncntl+1));

run;

data case;

set both;

if &outcome=1;

run;

proc sort; by &subgrp;

proc corr outp=outa; by &subgrp;

var probit_a probit_b;

run;

data control;

set both;

if &outcome=0;

run;

proc sort; by &subgrp;

proc corr outp=outb; by &subgrp;

var probit_a probit_b;

run;

data outa;

set outa;

if _TYPE_='CORR' and _NAME_='probit_b';

px=probit_a;

keep px &subgrp;

run;

data outb;

set outb;

if _TYPE_='CORR' and _NAME_='probit_b';

py=probit_a;

keep py &subgrp;

run;

data both2;

merge temp1 temp2 outa outb; by &subgrp;

if _n_=1 then set numcnt;

if _n_=1 then set numcas;

retain cs1-cs&n;

retain cn1-cn&n;

array ba(s) cn1-cn&n;

array aa(s) cs1-cs&n;

s=&subgrp;

ncase=aa;

ncntl=ba;

covar=(probbnrm(prob_theta_1,prob_theta_2,(px+py)/2)-theta_1*theta_2

+(ncntl-1)*(probbnrm(prob_theta_1,prob_theta_2,py/2)-theta_1*theta_2)

+(ncase-1)*(probbnrm(prob_theta_1,prob_theta_2,px/2)-theta_1*theta_2))

/(ncase*ncntl);

var_theta_diff=(var_theta_1+var_theta_2)-2*covar;

se_theta_diff=sqrt(var_theta_diff);

wt_theta_diff=1.0/var_theta_diff;

theta_diff=theta_1-theta_2;

wttheta_1=1.0/var_theta_1;

wttheta_2=1.0/var_theta_2;

/* create variable for output */

theta1ot =compress(put(theta_1,6.3)||'+/-'||put(se_theta_1,6.3));

theta2ot =compress(put(theta_2,6.3)||'+/-'||put(se_theta_2,6.3));

thetadiffot =compress(put(theta_diff,6.3)||'+/-'||put(se_theta_diff,6.3));

run;

proc print label;

var &subgrp ncase ncntl theta1ot theta2ot thetadiffot;

label ncase ='# of Cases'

ncntl ='# of Controls'

theta1ot ='Theta +/- se for model1'

theta2ot ='Theta +/- se for model2'

thetadiffot='Defference of Theta from 2 models';

title 'Both models';

run;

/* compute the test statistics */

data allages;

set both2 end=eof;

retain suma sumb sumc sumd sume sumf sncase sncntl 0;

if theta_1 ne . then suma=suma+wttheta_1*theta_1;

if wttheta_1 ne . then sumb=sumb+wttheta_1;

if theta_2 ne . then sumc=sumc+wttheta_2*theta_2;

if wttheta_2 ne . then sumd=sumd+wttheta_2;

if wt_theta_diff ne . then sume=sume+theta_diff*wt_theta_diff;

if wt_theta_diff ne . then sumf=sumf+wt_theta_diff;

if ncase ne . then sncase=sncase+ncase;

sncntl=sncntl+ncntl;

if eof then do;

theta1_summary=suma/sumb;

se_theta1_summary=sqrt(1.0/sumb);

theta2_summary=sumc/sumd;

se_theta2_summary=sqrt(1.0/sumd);

wt_theta1_summary=sumb;

wt_theta2_summary=sumd;

theta_diff_summary=sume/sumf;

se_theta_summary=sqrt(1.0/sumf);

wt_theta_summary=sumf;

tncase=sncase;

tncntl=sncntl;

z=theta_diff_summary/se_theta_summary;

pvalue=2*(1.0-probnorm(abs(z)));

drop suma sumb sumc sumd sume sumf;

output;

suma=0;

sumb=0;

sumc=0;

sumd=0;

sume=0;

sumf=0;

sncase=0;

sncntl=0;

end;

run;

data allages;

set allages;

theta1ots =put(round(theta1_summary, 0.001),6.3)||'+/-'||put(round(se_theta1_summary,0.001),6.3);

theta2ots =put(round(theta2_summary, 0.001),6.3)||'+/-'||put(round(se_theta2_summary,0.001),6.3);

thetadiffots =put(round(theta_diff_summary,0.001),6.3)||'+/-'||put(round(se_theta_summary,0.001),6.3);

run;

proc print data=allages label;

var tncase tncntl

theta1ots theta2ots thetadiffots pvalue;

label tncase ='# of Cases Summary'

tncntl ='# of Controls Summary'

theta1ots ='Theta +/- se for model1 Summary'

theta2ots ='Theta +/- se for model2 Summary'

thetadiffots='Defference of Theta from 2 models Summary';

run;

%end;

%else %if &subgrp=%str() %then %do;

%EstTheta1(dsn=&maindsn, mdlnum=1);

%EstTheta1(dsn=&maindsn, mdlnum=2);

data both;

merge rnkall1 rnkall2 end=eof;

by &id;

if _n_=1 then set numcnt;

if _n_=1 then set numcas;

retain cs1;

retain cn1;

ncase=cs1;

ncntl=cn1;

probit_a=probit(rscore_all1/(ncase+ncntl+1));

probit_b=probit(rscore_all2/(ncase+ncntl+1));

run;

data case;

set both;

if &outcome=1;

run;

proc corr outp=outa;

var probit_a probit_b;

run;

data control;

set both;

if &outcome=0;

run;

proc corr outp=outb; by &subgrp;

var probit_a probit_b;

run;

data outa;

set outa;

if _TYPE_='CORR' and _NAME_='probit_b';

px=probit_a;

keep px ;

run;

data outb;

set outb;

if _TYPE_='CORR' and _NAME_='probit_b';

py=probit_a;

keep py ;

run;

data both2;

merge temp1 temp2 outa outb; ;

if _n_=1 then set numcnt;

if _n_=1 then set numcas;

retain cs1;

retain cn1;

ncase=cs1;

ncntl=cn1;

covar=(probbnrm(prob_theta_1,prob_theta_2,(px+py)/2)-theta_1*theta_2

+(ncntl-1)*(probbnrm(prob_theta_1,prob_theta_2,py/2)-theta_1*theta_2)

+(ncase-1)*(probbnrm(prob_theta_1,prob_theta_2,px/2)-theta_1*theta_2))

/(ncase*ncntl);

var_theta_diff=(var_theta_1+var_theta_2)-2*covar;

se_theta_diff=sqrt(var_theta_diff);

wt_theta_diff=1.0/var_theta_diff;

theta_diff=theta_1-theta_2;

wttheta_1=1.0/var_theta_1;

wttheta_2=1.0/var_theta_2;

/* create variable for output */

theta1ot =compress(put(theta_1,6.3)||'+/-'||put(se_theta_1,6.3));

theta2ot =compress(put(theta_2,6.3)||'+/-'||put(se_theta_2,6.3));

thetadiffot =compress(put(theta_diff,6.3)||'+/-'||put(se_theta_diff,6.3));

run;

proc print label;

var ncase ncntl theta1ot theta2ot thetadiffot;

label ncase ='# of Cases'

ncntl ='# of Controls'

theta1ot ='Theta +/- se for model1'

theta2ot ='Theta +/- se for model2'

thetadiffot='Defference of Theta from 2 models';

title 'Both models';

run;

%end;

%mend WilGoodnessFit;

%macro EstTheta(dsn, mdlnum);

proc sort data=&dsn;

by &subgrp &outcome;

run;

proc freq data=&dsn noprint;

tables &outcome*&subgrp/out=ab;

run;

/* get total# of cases for each subgroup */

data numcas;

set ab;

if &outcome=1;

run;

data numcas;

set numcas end=eof;

retain cs1-cs&n 0;

array aa(s) cs1-cs&n;

s=base_agegrp;

aa=count;

drop &subgrp &outcome ;

if eof then output;

run;

/* get total# of controls for each subgroup */

data numcnt;

set ab end=eof;

if &outcome=0;

run;

data numcnt;

set numcnt end=eof;

retain cn1-cn&n 0;;

array aa(s) cn1-cn&n;

s=base_agegrp;

aa=count;

drop &subgrp &outcome ;

if eof then output;

run;

/* estimate theta */

proc rank data=&dsn out=rnkall&mdlnum(keep=&subgrp &id rscore_all&mdlnum &outcome);

by &subgrp;

var score&mdlnum;

ranks rscore_all&mdlnum;

run;

proc sort data=rnkall&mdlnum;

by &subgrp &id;

run;

data temp&mdlnum;

set rnkall&mdlnum end=eof;

by &subgrp;

if _n_=1 then do;

set numcnt;

set numcas;

end;

retain cs1-cs&n;

retain cn1-cn&n;

retain ratio_1 sumrnk 0;

array ba(s) cn1-cn&n;

array aa(s) cs1-cs&n;;

s=&subgrp;;

ncase=aa;

ncntl=ba;

if &outcome=1 then sumrnk=sumrnk+rscore_all&mdlnum;

avernk=(aa+ba+1)/2;

ratio=(rscore_all&mdlnum-avernk)**2/(aa+ba-1);

ratio_&mdlnum=ratio_&mdlnum+ratio;

if last.&subgrp then do;

theta_&mdlnum=(sumrnk-(ncase*(ncase+1)/2))/(ncase*ncntl);

prob_theta_&mdlnum=probit(theta_&mdlnum);

var_theta_&mdlnum=(theta_&mdlnum*(1-theta_&mdlnum)+(aa+ba-2)*

(probbnrm(prob_theta_&mdlnum,prob_theta_&mdlnum,0.5)-theta_&mdlnum**2))/(aa*ba);

se_theta_&mdlnum=sqrt(var_theta_&mdlnum);

wttheta_&mdlnum=1.0/var_theta_&mdlnum;

output;

ratio_&mdlnum=0;

sumrnk=0;

end;

keep &subgrp ncase ncntl theta_&mdlnum ratio_&mdlnum var_theta_&mdlnum

se_theta_&mdlnum wttheta_&mdlnum prob_theta_&mdlnum;

run;

%mend EstTheta;

%macro EstTheta1(dsn, mdlnum);

proc sort data=&dsn;

by &outcome;

run;

proc freq data=&dsn noprint;

tables &outcome/out=ab;

run;

/* get total# of cases for each subgroup */

data numcas;

set ab;

if &outcome=1;

cs1=count;

drop &outcome;

run;

data numcnt;

set ab ;

retain cn1 0;;

cn1=count;

drop &outcome ;

run;

/* estimate theta */

proc rank data=&dsn out=rnkall&mdlnum(keep= &id rscore_all&mdlnum &outcome);

var score&mdlnum;

ranks rscore_all&mdlnum;

run;

proc sort data=rnkall&mdlnum;

by &id;

run;

data temp&mdlnum;

set rnkall&mdlnum end=eof;

if _n_=1 then do;

set numcnt;

set numcas;

end;

retain cs1;

retain cn1;

retain ratio_1 sumrnk 0;

ncase=cs1;

ncntl=cn1;

if &outcome=1 then sumrnk=sumrnk+rscore_all&mdlnum;

avernk=(cs1+cn1+1)/2;

ratio=(rscore_all&mdlnum-avernk)**2/(cs1+cn1-1);

ratio_&mdlnum=ratio_&mdlnum+ratio;

if eof then do;

theta_&mdlnum=(sumrnk-(ncase*(ncase+1)/2))/(ncase*ncntl);

prob_theta_&mdlnum=probit(theta_&mdlnum);

var_theta_&mdlnum=(theta_&mdlnum*(1-theta_&mdlnum)+(cs1+cn1-2)*

(probbnrm(prob_theta_&mdlnum,prob_theta_&mdlnum,0.5)-theta_&mdlnum**2))/(cs1*cn1);

se_theta_&mdlnum=sqrt(var_theta_&mdlnum);

wttheta_&mdlnum=1.0/var_theta_&mdlnum;

output;

end;

keep ncase ncntl theta_&mdlnum ratio_&mdlnum var_theta_&mdlnum

se_theta_&mdlnum wttheta_&mdlnum prob_theta_&mdlnum;

run;

%mend EstTheta1;