WilcxGoodnessFit.sas
/******************************************************************
* Program Name: WilGoodnessFit.sas *
* Program Dir : /udd/nhron/macrodevelopment/ *
* Program Date: Jan. 2008 *
* *
* Macro Variables: *
* *
* maindsn : data that contains id,subgrp,outcome, *
* score1 from model1, score2 from model2 *
* outcome: outcome 1: cases; 0: controls *
* subgrp : subgrp coding should be 1, 2 and etc. *
* id : id *
* *
* Note1: This program is to use the Mann-Whitney *
* U statistic approach to compare the C *
* statistic for *
* two different risk prediction models on the *
* the same dataset *
* Note2: *
* Normality of the risk score distribution *
* is not required to use this method *
* Based on paper "Power and sample size estimation *
* for the Wilcoxon Rank Sum Test" *
* by B. Rosner and R. Glynn *
* Programmed by: Rong Chen and Marion McPhee *
****************************************************************/
%macro WilGoodnessFit(maindsn=,
outcome=,
subgrp=,
id=);
%if &subgrp ne %then %do;
proc sql;
select count(distinct(&subgrp)) into: s
from &maindsn;
quit;
data _null_;
call symput('n', trim(left(&s)));
run;
%EstTheta(dsn=&maindsn, mdlnum=1);
%EstTheta(dsn=&maindsn, mdlnum=2);
data both;
merge rnkall1 rnkall2 end=eof;
by &subgrp &id;
if _n_=1 then set numcnt;
if _n_=1 then set numcas;
retain cs1-cs&n;
retain cn1-cn&n;
array ba(s) cn1-cn&n;
array aa(s) cs1-cs&n;
s=&subgrp;
ncase=aa;
ncntl=ba;
probit_a=probit(rscore_all1/(ncase+ncntl+1));
probit_b=probit(rscore_all2/(ncase+ncntl+1));
run;
data case;
set both;
if &outcome=1;
run;
proc sort; by &subgrp;
proc corr outp=outa; by &subgrp;
var probit_a probit_b;
run;
data control;
set both;
if &outcome=0;
run;
proc sort; by &subgrp;
proc corr outp=outb; by &subgrp;
var probit_a probit_b;
run;
data outa;
set outa;
if _TYPE_='CORR' and _NAME_='probit_b';
px=probit_a;
keep px &subgrp;
run;
data outb;
set outb;
if _TYPE_='CORR' and _NAME_='probit_b';
py=probit_a;
keep py &subgrp;
run;
data both2;
merge temp1 temp2 outa outb; by &subgrp;
if _n_=1 then set numcnt;
if _n_=1 then set numcas;
retain cs1-cs&n;
retain cn1-cn&n;
array ba(s) cn1-cn&n;
array aa(s) cs1-cs&n;
s=&subgrp;
ncase=aa;
ncntl=ba;
covar=(probbnrm(prob_theta_1,prob_theta_2,(px+py)/2)-theta_1*theta_2
+(ncntl-1)*(probbnrm(prob_theta_1,prob_theta_2,py/2)-theta_1*theta_2)
+(ncase-1)*(probbnrm(prob_theta_1,prob_theta_2,px/2)-theta_1*theta_2))
/(ncase*ncntl);
var_theta_diff=(var_theta_1+var_theta_2)-2*covar;
se_theta_diff=sqrt(var_theta_diff);
wt_theta_diff=1.0/var_theta_diff;
theta_diff=theta_1-theta_2;
wttheta_1=1.0/var_theta_1;
wttheta_2=1.0/var_theta_2;
/* create variable for output */
theta1ot =compress(put(theta_1,6.3)||'+/-'||put(se_theta_1,6.3));
theta2ot =compress(put(theta_2,6.3)||'+/-'||put(se_theta_2,6.3));
thetadiffot =compress(put(theta_diff,6.3)||'+/-'||put(se_theta_diff,6.3));
run;
proc print label;
var &subgrp ncase ncntl theta1ot theta2ot thetadiffot;
label ncase ='# of Cases'
ncntl ='# of Controls'
theta1ot ='Theta +/- se for model1'
theta2ot ='Theta +/- se for model2'
thetadiffot='Defference of Theta from 2 models';
title 'Both models';
run;
/* compute the test statistics */
data allages;
set both2 end=eof;
retain suma sumb sumc sumd sume sumf sncase sncntl 0;
if theta_1 ne . then suma=suma+wttheta_1*theta_1;
if wttheta_1 ne . then sumb=sumb+wttheta_1;
if theta_2 ne . then sumc=sumc+wttheta_2*theta_2;
if wttheta_2 ne . then sumd=sumd+wttheta_2;
if wt_theta_diff ne . then sume=sume+theta_diff*wt_theta_diff;
if wt_theta_diff ne . then sumf=sumf+wt_theta_diff;
if ncase ne . then sncase=sncase+ncase;
sncntl=sncntl+ncntl;
if eof then do;
theta1_summary=suma/sumb;
se_theta1_summary=sqrt(1.0/sumb);
theta2_summary=sumc/sumd;
se_theta2_summary=sqrt(1.0/sumd);
wt_theta1_summary=sumb;
wt_theta2_summary=sumd;
theta_diff_summary=sume/sumf;
se_theta_summary=sqrt(1.0/sumf);
wt_theta_summary=sumf;
tncase=sncase;
tncntl=sncntl;
z=theta_diff_summary/se_theta_summary;
pvalue=2*(1.0-probnorm(abs(z)));
drop suma sumb sumc sumd sume sumf;
output;
suma=0;
sumb=0;
sumc=0;
sumd=0;
sume=0;
sumf=0;
sncase=0;
sncntl=0;
end;
run;
data allages;
set allages;
theta1ots =put(round(theta1_summary, 0.001),6.3)||'+/-'||put(round(se_theta1_summary,0.001),6.3);
theta2ots =put(round(theta2_summary, 0.001),6.3)||'+/-'||put(round(se_theta2_summary,0.001),6.3);
thetadiffots =put(round(theta_diff_summary,0.001),6.3)||'+/-'||put(round(se_theta_summary,0.001),6.3);
run;
proc print data=allages label;
var tncase tncntl
theta1ots theta2ots thetadiffots pvalue;
label tncase ='# of Cases Summary'
tncntl ='# of Controls Summary'
theta1ots ='Theta +/- se for model1 Summary'
theta2ots ='Theta +/- se for model2 Summary'
thetadiffots='Defference of Theta from 2 models Summary';
run;
%end;
%else %if &subgrp=%str() %then %do;
%EstTheta1(dsn=&maindsn, mdlnum=1);
%EstTheta1(dsn=&maindsn, mdlnum=2);
data both;
merge rnkall1 rnkall2 end=eof;
by &id;
if _n_=1 then set numcnt;
if _n_=1 then set numcas;
retain cs1;
retain cn1;
ncase=cs1;
ncntl=cn1;
probit_a=probit(rscore_all1/(ncase+ncntl+1));
probit_b=probit(rscore_all2/(ncase+ncntl+1));
run;
data case;
set both;
if &outcome=1;
run;
proc corr outp=outa;
var probit_a probit_b;
run;
data control;
set both;
if &outcome=0;
run;
proc corr outp=outb; by &subgrp;
var probit_a probit_b;
run;
data outa;
set outa;
if _TYPE_='CORR' and _NAME_='probit_b';
px=probit_a;
keep px ;
run;
data outb;
set outb;
if _TYPE_='CORR' and _NAME_='probit_b';
py=probit_a;
keep py ;
run;
data both2;
merge temp1 temp2 outa outb; ;
if _n_=1 then set numcnt;
if _n_=1 then set numcas;
retain cs1;
retain cn1;
ncase=cs1;
ncntl=cn1;
covar=(probbnrm(prob_theta_1,prob_theta_2,(px+py)/2)-theta_1*theta_2
+(ncntl-1)*(probbnrm(prob_theta_1,prob_theta_2,py/2)-theta_1*theta_2)
+(ncase-1)*(probbnrm(prob_theta_1,prob_theta_2,px/2)-theta_1*theta_2))
/(ncase*ncntl);
var_theta_diff=(var_theta_1+var_theta_2)-2*covar;
se_theta_diff=sqrt(var_theta_diff);
wt_theta_diff=1.0/var_theta_diff;
theta_diff=theta_1-theta_2;
wttheta_1=1.0/var_theta_1;
wttheta_2=1.0/var_theta_2;
/* create variable for output */
theta1ot =compress(put(theta_1,6.3)||'+/-'||put(se_theta_1,6.3));
theta2ot =compress(put(theta_2,6.3)||'+/-'||put(se_theta_2,6.3));
thetadiffot =compress(put(theta_diff,6.3)||'+/-'||put(se_theta_diff,6.3));
run;
proc print label;
var ncase ncntl theta1ot theta2ot thetadiffot;
label ncase ='# of Cases'
ncntl ='# of Controls'
theta1ot ='Theta +/- se for model1'
theta2ot ='Theta +/- se for model2'
thetadiffot='Defference of Theta from 2 models';
title 'Both models';
run;
%end;
%mend WilGoodnessFit;
%macro EstTheta(dsn, mdlnum);
proc sort data=&dsn;
by &subgrp &outcome;
run;
proc freq data=&dsn noprint;
tables &outcome*&subgrp/out=ab;
run;
/* get total# of cases for each subgroup */
data numcas;
set ab;
if &outcome=1;
run;
data numcas;
set numcas end=eof;
retain cs1-cs&n 0;
array aa(s) cs1-cs&n;
s=base_agegrp;
aa=count;
drop &subgrp &outcome ;
if eof then output;
run;
/* get total# of controls for each subgroup */
data numcnt;
set ab end=eof;
if &outcome=0;
run;
data numcnt;
set numcnt end=eof;
retain cn1-cn&n 0;;
array aa(s) cn1-cn&n;
s=base_agegrp;
aa=count;
drop &subgrp &outcome ;
if eof then output;
run;
/* estimate theta */
proc rank data=&dsn out=rnkall&mdlnum(keep=&subgrp &id rscore_all&mdlnum &outcome);
by &subgrp;
var score&mdlnum;
ranks rscore_all&mdlnum;
run;
proc sort data=rnkall&mdlnum;
by &subgrp &id;
run;
data temp&mdlnum;
set rnkall&mdlnum end=eof;
by &subgrp;
if _n_=1 then do;
set numcnt;
set numcas;
end;
retain cs1-cs&n;
retain cn1-cn&n;
retain ratio_1 sumrnk 0;
array ba(s) cn1-cn&n;
array aa(s) cs1-cs&n;;
s=&subgrp;;
ncase=aa;
ncntl=ba;
if &outcome=1 then sumrnk=sumrnk+rscore_all&mdlnum;
avernk=(aa+ba+1)/2;
ratio=(rscore_all&mdlnum-avernk)**2/(aa+ba-1);
ratio_&mdlnum=ratio_&mdlnum+ratio;
if last.&subgrp then do;
theta_&mdlnum=(sumrnk-(ncase*(ncase+1)/2))/(ncase*ncntl);
prob_theta_&mdlnum=probit(theta_&mdlnum);
var_theta_&mdlnum=(theta_&mdlnum*(1-theta_&mdlnum)+(aa+ba-2)*
(probbnrm(prob_theta_&mdlnum,prob_theta_&mdlnum,0.5)-theta_&mdlnum**2))/(aa*ba);
se_theta_&mdlnum=sqrt(var_theta_&mdlnum);
wttheta_&mdlnum=1.0/var_theta_&mdlnum;
output;
ratio_&mdlnum=0;
sumrnk=0;
end;
keep &subgrp ncase ncntl theta_&mdlnum ratio_&mdlnum var_theta_&mdlnum
se_theta_&mdlnum wttheta_&mdlnum prob_theta_&mdlnum;
run;
%mend EstTheta;
%macro EstTheta1(dsn, mdlnum);
proc sort data=&dsn;
by &outcome;
run;
proc freq data=&dsn noprint;
tables &outcome/out=ab;
run;
/* get total# of cases for each subgroup */
data numcas;
set ab;
if &outcome=1;
cs1=count;
drop &outcome;
run;
data numcnt;
set ab ;
retain cn1 0;;
cn1=count;
drop &outcome ;
run;
/* estimate theta */
proc rank data=&dsn out=rnkall&mdlnum(keep= &id rscore_all&mdlnum &outcome);
var score&mdlnum;
ranks rscore_all&mdlnum;
run;
proc sort data=rnkall&mdlnum;
by &id;
run;
data temp&mdlnum;
set rnkall&mdlnum end=eof;
if _n_=1 then do;
set numcnt;
set numcas;
end;
retain cs1;
retain cn1;
retain ratio_1 sumrnk 0;
ncase=cs1;
ncntl=cn1;
if &outcome=1 then sumrnk=sumrnk+rscore_all&mdlnum;
avernk=(cs1+cn1+1)/2;
ratio=(rscore_all&mdlnum-avernk)**2/(cs1+cn1-1);
ratio_&mdlnum=ratio_&mdlnum+ratio;
if eof then do;
theta_&mdlnum=(sumrnk-(ncase*(ncase+1)/2))/(ncase*ncntl);
prob_theta_&mdlnum=probit(theta_&mdlnum);
var_theta_&mdlnum=(theta_&mdlnum*(1-theta_&mdlnum)+(cs1+cn1-2)*
(probbnrm(prob_theta_&mdlnum,prob_theta_&mdlnum,0.5)-theta_&mdlnum**2))/(cs1*cn1);
se_theta_&mdlnum=sqrt(var_theta_&mdlnum);
wttheta_&mdlnum=1.0/var_theta_&mdlnum;
output;
end;
keep ncase ncntl theta_&mdlnum ratio_&mdlnum var_theta_&mdlnum
se_theta_&mdlnum wttheta_&mdlnum prob_theta_&mdlnum;
run;
%mend EstTheta1;