/******************************************************************************
                    DOMINANCE PROBABILITY MACRO
                    
This macro executes dominance analysis as described by Razia Azen & David Budescu 
in Psychological Methods, 2003.  Dominance analysis quantifies the importance of 
each predictor as its average increment to the model r-squared, across all possible 
submodel sizes.
This macro was written by Razia Azen with valuable contributions from Robert 
Ceurvorst.  

NOTE:  This program is limited to at most 10 predictors!

-------------
INSTRUCTIONS:
-------------
1. In this macro, change the %macro dom line (which appears just below these 
instructions) according to the following directions:
%macro dom (data=_last_, dep=Y, indep='list of predictors', p=n_of_predictors,
      noprint=0, short=0, B=n_of_bootstraps, predtype='type_of_predictors', 
	  seed=random_number);
      
  If defaults are used, then only p= OR indep= is necessary, e.g., "%dom(p=4)"
  will use the last data set created and operate on variables Y and X1-X4.      

Either p= OR indep= is required -- not both.  If both are specified, p is 
determined by counting the variables in the indep= list.

data=      SAS dataset to be used.  Default is last dataset created.

dep=       Name of dependent variable.  Default is Y.

indep=     List of predictors in quotes.
 OR
p=         No. of predictors IF and ONLY IF they are named X1-Xp,
           in which case indep= is not required.

noprint=1  Suppresses printing of the input dataset.

short=1    Suppresses listing of each predictor's contributions to individual 
           submodels.

B=  	   The number of bootstrap runs.  Default is 1000.                  

predtype= 'f' if the predictors are fixed;     
          'r' if the predictors are random.  Default is 'r'.         

seed=     Six digits.  Default is 0 (uses clock value).    

For example, with 4 predictors named X1, X2, X3, X4 and all default values:
(data=_last_, dep=Y, p=4, indep=, noprint=0, short=0, B=1000, predtype='r', seed=0);
Or, with 4 predictors named A, B, C, D and all default values:
(data=_last_, dep=Y, p=, indep='A B C D', noprint=0, short=0, B=1000, predtype='r', seed=0);

2. Save this macro, and add the following two lines (below) to the
SAS program in which you've read the data set to be analyzed:

%include 'k:\sasmacro\uniDAmacro.sas';  *** CHANGE TO PATH WHERE MACRO IS SAVED ***;
%dom;

3.  Run the program edited in step 2.

-- END OF INSTRUCTIONS --
******************************************************************************/
option nosource;
/********** CHANGE ONLY THE LINE BELOW (See step 1 above)!!! **************/
%macro dom (data=_last_, dep=Y, p=, indep=' ', noprint=0, short=0, B=1000, predtype='r', seed=0);
/**************************************************************************/

%if &indep= and &p= %then %do;
  %put YOU MUST SPECIFY INDEP=list of predictors OR P=no. of predictors (IF THEY ARE NAMED X1-Xp).;
  %goto done;
%end;

%else %if &indep ne %then %do;
  %if %index(&indep,%str(%'))>0 or %index(&indep,%str(%"))>0 
      %then %let indep = %substr(&indep,2,%length(&indep)-2);
  %if %index(&indep,-) > 0 %then %expand;
  %let p=1; 
  %do %while(%length(%scan(&indep,&p))>0);
      %let x&p = %scan(&indep,&p);
      %let p = %eval(&p+1);
  %end;
  %let p = %eval(&p-1);
%end;

%else %if &p > 0 %then %do; 
  %let indep=;
  %do i=1 %to &p; 
      %let indep=&indep x&i;
      %let x&i=x&i;
  %end;  
%end;

%if &p>10 %then %do;
  %put  THE MAXIMUM NUMBER OF PREDICTORS IS 10.  YOU HAVE &P..;
  %goto done;
%end;

%LET DATANAME=&SYSLAST;
option nonotes;
DATA _NULL_;  FILE PRINT LINESLEFT=WITH;  CALL SYMPUT('WITH',WITH);
DATA _NULL_;
  FILE PRINT NOTITLES LINESLEFT=WITHOUT;
  CALL SYMPUT ('MTITLE',TRIM(LEFT(WITHOUT-&WITH+2)));
  CALL SYMPUT('NMODELS', 2**&p -1);
RUN;

data original; set &data;

%if &noprint=0 %then %do;
proc print;
title&mtitle 'Input data set (check to make sure it is correct)'; run;
%end;

data labels; set;  
  keep &indep; 
  length dvlabel $40;
  call label(&dep,dvlabel);
  call symput('dvl',trim(dvlabel));
  if _n_=1 then stop;  
run;   


/*********************************************************
     PART 1:  regression/DA of the original data 
*********************************************************/

title 'regression results for original data set';
option notes;
proc reg corr data=original outest=onereg (keep=_in_ _rsq_ &indep);
  model &dep=&indep / stb pcorr2 scorr2 ;
  model &dep=&indep / selection=adjrsq best=&nmodels %if &short=1 %then noprint;;
run; quit; option nonotes;


* order all subset models in lexicographical order;
data modelmat;
  set onereg; 
  %if %substr(&sysver,1,1)>7 %then if _n_>1;;
  %do i=1 %to &p;
    &&x&i=(&&x&i>.); 
  %end;
  if _IN_=. then delete;
  keep &indep _IN_ _RSQ_;
proc sort; by _IN_ %do i=1 %to &p; descending &&x&i %end;; run;
proc print; title 'R-squared values for all subset models'; run;

 /*** %if &short=0 %then %do;
   proc print; id _IN_ _RSQ_; format _RSQ_ 5.4; run;
 %end;  ***/


/**********************************************
          DOMINANCE ANALYSIS TABLE  
**********************************************/
option notes; title;
proc IML;
 reset noprint;
 start;
 * read the subset models matrix into DOM;
 use modelmat;
  read all into DOM;
 close modelmat;

* dom contains, for each subset model, 1/0 values for the predictors,
  the number of predictors in (_IN_), and the model r2 (_RSQ_); 

   p=ncol(dom)-2;
   ncol=ncol(dom);
   
* dom is rearranged to contain, for each subset model, the number of predictors
  in and the r2 of the model, followed by 1/0 values for predictors in/out;
  dom1=dom[,1:p];  dom2=dom[,(p+1):ncol];  dom=dom2||dom1;  free dom1 dom2; 

* generate table of additional contributions; 
   null=J(1,ncol,0);
   dom=null//dom;
   nrow=nrow(dom);
* dom now contains a top row of zeros for the null model;
   full=J(1,p,99);  fullrsq=J(1,1,99);
   reduced=J(1,p,99);  redrsq=J(1,1,99);
   contrib=J(nrow,p,0);  * additional contributions matrix;

    do i=1 to nrow;      * for each model;
     do j=1 to p;        * for each predictor;

       if dom(|i,j+2|)=0 then do; * if predictor is not in subset model;
        reduced=dom[i,3:ncol];    * the 1/0 row represents the reduced model;
        full=reduced;             * the full model is same as reduced model;

         do k=1 to p;          
          if k=j then full(|1,k|)=1;   * add the jth predictor to the full model;
         end;    

         do r=1 to nrow;               * for each model;
          comp=dom[r,3:ncol];          * comp is the 1/0 row of dom;
          if comp=full then fullrsq=dom[r,2];    * r2 of row is fullrsq, or; 
          if comp=reduced then redrsq=dom[r,2];  * r2 of row is redrsq;
         end;

         contrib(|i,j|)=fullrsq-redrsq;          * contrib is r2 difference;
       end;

       else do;
        contrib(|i,j|)=.;         * if predictor is in model, contib is .;
       end;

     end;
    end;

   contrib=dom||contrib;
   contrib=contrib[1:nrow-1,];

  cols = {IN RSQ %do i=1 %to &p; &&X&i %end; %do i=1 %to &p; CP&i %end;};

  create rsqtable from contrib[colname=cols];
   append from contrib;
  close rsqtable;

finish;
run; quit;

option nonotes;

%if &short=0 %then %do;
 proc print data=rsqtable;  id IN RSQ;  format RSQ CP1-CP&p 5.4; 
 title&mtitle 'Dominance Table:  Additional Contributions of Predictors Across All Subset Regression Models';
 %UNQUOTE(TITLE%EVAL(&MTITLE+1)) 'CPi indicates the additional contribution of predictor i to the model r-square';
%end;

proc summary nway; var CP1--CP&p; class in;
 output out=avgcont (drop=_type_) mean=&indep;
run;
proc means noprint; var &indep; 
 output out=meanc mean=;
run;
data avgcont; set labels meanc (in=y) avgcont;
  if y then in=999;
proc print double;  id IN;  var &indep;  
  format &indep 5.4  IN %if &p<11 %then 1.0; %else 2.0;;
title&mtitle 'Dominance Analysis:  Overall Average Contributions of Predictors (First Row)';
%UNQUOTE(TITLE%EVAL(&MTITLE+1)) 'And Average Contributions to Models of Each Size (Remaining Rows)'; run;

proc transpose prefix=size out=meanc (rename=(SIZE999=OVERALL _NAME_=VAR)); id IN; var &indep; run;
proc sort; by descending overall;
title&mtitle 'Dominance Analysis:  Average Predictor Contributions Overall and to Models of Each Size';
proc print; id _character_; format _numeric_ 5.4; run;


TITLE&MTITLE;
%DONE: option notes _last_=&syslast;
run;


/**********************************************
          DOMINANCE MATRICES (Sample)  
**********************************************/

Proc IML;
start;
use modelmat;
 read all into damat;
close modelmat;
Dcsample=J(&p,&p,0); Dasample=J(&p,&p,0); Dgsample=J(&p,&p,0);
RSQ=damat[,ncol(damat)];

/***************************************************/
/*****           Complete dominance		        ****/
/***************************************************/
do i=1 to &p-1;
 do j=i+1 to &p;

 * define matrix of constants ("comp") to determine complete dominance between
 each pair of predictors;

        comp=J(2**(&p-2),2**&p-1,0);

        * Xh is any subset model that does not include i and j;
        * Xh contains the columns of damat that do not involve i or j;
        Xh=J(nrow(damat),&p-2,99);
		Xhcol=0;			* index column number in Xh;
		do h=1 to &p;
		 if (i ^= h & j ^= h) then do;	* find non i,j columns;
		  Xhcol=Xhcol+1;				* update column number in Xh;
		  Xh[,Xhcol] = damat[,h];		* assign column to Xh;
		 end;
		end;

		* contrast rows (subsets) representing XiXh and XjXh;
        comprow=1;
        do r=1 to 2**&p-2;               * for each pair of rows;
         do s=r+1 to 2**&p-1;
          if Xh[r,]=Xh[s,] then do;     * if the rows of Xh are the same;
                                        * and if i and j are contrasted;
           if (damat[r,i]=1 & damat[r,j]=0 & damat[s,i]=0 &
                damat[s,j]=1) then do;
                 comp[comprow,r]=1;
                 comp[comprow,s]=-1;
                 comprow=comprow+1;
           end;
          end;

         end;   * do r loop;
        end;    * do s loop;


*** Determine the complete Dij value ***;

  cdiffij=comp*RSQ;

  zero=J(nrow(cdiffij), ncol(cdiffij),0);
  ijdom=nrow(cdiffij);

  * obtain complete dominance matrices;
     * undetermined case (all differences are zero);
     if cdiffij=zero then do;
        Dcsample[i,j]=0.5;
        Dcsample[j,i]=Dcsample[i,j];
     end;
     * else, check signs of difference elements;
     if cdiffij ^= zero then do;
      nonneg=0; nonpos=0;
      do k=1 to nrow(cdiffij);
       if cdiffij[k,]>=0 then nonneg=nonneg+1;
       if cdiffij[k,]<=0 then nonpos=nonpos+1;
      end;
      * dominance case;
      if nonneg=ijdom then Dcsample[i,j]=1; else Dcsample[i,j]=0;
      if nonpos=ijdom then Dcsample[j,i]=1; else Dcsample[j,i]=0;
      * undetermined case (differences have different signs);
      if (Dcsample[i,j]=0 & Dcsample[j,i]=0) then do;
        Dcsample[i,j]=0.5;
        Dcsample[j,i]=Dcsample[i,j];
      end;
     end;


/***************************************************/
/*****     Conditional dominance                ****/
/***************************************************/
* define matrix of constants ("avg") to determine average (within model
size) dominance between each pair of predictors;

subsets=damat;
* subsets is a matrix that consists of p+2 columns (and 
the null model in the top row): 1/0 for x1 - xp, in, Rsq;

  avg=J(&p,2**&p-1,0);
  do c=1 to &p;         * for each model size;
   num=0;               * index number of subsets per size;
   do m=1 to 2**&p-1;	* for each model, determine number of models
   						of size is c-1 to which the ith predictor makes
   						an additional contribution;
	if (subsets[m,&p+1]=c-1 & subsets[m,i]=0) then num=num+1;
   end;
   if num=0 then num=1;	* for the case of the null model (size=0);
   do r=1 to 2**&p-1;    * determine subsets to contrast;
    * consider additional contributions for model size (c);
    if (subsets[r,&p+1]=c-1 | subsets[r,&p+1]=c) then do;
     * average contribution of models that inlcude i but not j;
     if (subsets[r,i]=1 & subsets[r,j]=0) then avg[c,r]=1/num;
     * average contribution of models that inlcude j but not i;
     if (subsets[r,i]=0 & subsets[r,j]=1) then avg[c,r]=-1/num;
    end;
   end;
  end;

*** Determine the average Dij value ***;

  adiffij=avg*RSQ;

  zero=J(nrow(adiffij), ncol(adiffij),0);
  ijdom=nrow(adiffij);

  * obtain average dominance matrices;
     * undetermined case (all differences are zero);
     if adiffij=zero then do;
        Dasample[i,j]=0.5;
        Dasample[j,i]=Dasample[i,j];
     end;
     * else, check signs of difference elements;
     if adiffij ^= zero then do;
      nonneg=0; nonpos=0;
      do k=1 to nrow(adiffij);
       if adiffij[k,]>=0 then nonneg=nonneg+1;
       if adiffij[k,]<=0 then nonpos=nonpos+1;
      end;
      * dominance case;
      if nonneg=ijdom then Dasample[i,j]=1; else Dasample[i,j]=0;
      if nonpos=ijdom then Dasample[j,i]=1; else Dasample[j,i]=0;
      * undetermined case (differences have different signs);
      if (Dasample[i,j]=0 & Dasample[j,i]=0) then do;
        Dasample[i,j]=0.5;
        Dasample[j,i]=Dasample[i,j];
      end;
     end;


/***************************************************/
/*****         General dominance                ****/
/***************************************************/
* define matrix of constants ("glob") to determine global (overall average)
dominance between each pair of predictors;

  glob=J(1,2**&p-1,99);
  do g=1 to 2**&p-1;
   glob[,g]=avg[+,g]/&p;
  end;

*** Determine the global Dij value ***;

  gdiffij=glob*RSQ;

  zero=J(nrow(gdiffij), ncol(gdiffij),0);
  ijdom=nrow(gdiffij);

  * obtain global dominance matrices;
     * undetermined case (all differences are zero);
     if gdiffij=zero then do;
        Dgsample[i,j]=0.5;
        Dgsample[j,i]=Dgsample[i,j];
     end;
     * else, check signs of difference elements;
     if gdiffij ^= zero then do;
      nonneg=0; nonpos=0;
      do k=1 to nrow(gdiffij);
       if gdiffij[k,]>=0 then nonneg=nonneg+1;
       if gdiffij[k,]<=0 then nonpos=nonpos+1;
      end;
      * dominance case;
      if nonneg=ijdom then Dgsample[i,j]=1; else Dgsample[i,j]=0;
      if nonpos=ijdom then Dgsample[j,i]=1; else Dgsample[j,i]=0;
      * undetermined case (differences have different signs);
      if (Dgsample[i,j]=0 & Dgsample[j,i]=0) then do;
        Dgsample[i,j]=0.5;
        Dgsample[j,i]=Dgsample[i,j];
      end;
     end;


end; *(i loop);
end; *(j loop);


create Dcsample from Dcsample;
 append from Dcsample;
close Dcsample;
create Dasample from Dasample;
 append from Dasample;
close Dasample;
create Dgsample from Dgsample;
 append from Dgsample;
close Dgsample;

* create a file with sample dominance values;
* variables: i, j, cell value;
Sample_dc = J(&p*(&p-1), 3, 0); Sample_da = J(&p*(&p-1), 3, 0);
Sample_dg = J(&p*(&p-1), 3, 0); row=0;
do i=1 to nrow(Dcsample);
 do j=1 to ncol(Dcsample);
  if i ^= j then do;
   row=row+1;
   Sample_dc[row,1]=i; Sample_dc[row,2]=j; Sample_dc[row,3]=Dcsample[i,j]; 
   Sample_da[row,1]=i; Sample_da[row,2]=j; Sample_da[row,3]=Dasample[i,j]; 
   Sample_dg[row,1]=i; Sample_dg[row,2]=j; Sample_dg[row,3]=Dgsample[i,j]; 
  end;
 end;
end;
create sample_dc from Sample_dc[colname={i j dij}];
 append from Sample_dc;
close sample_dc;
create sample_da from Sample_da[colname={i j dij}];
 append from Sample_da;
close sample_da;
create sample_dg from Sample_dg[colname={i j dij}];
 append from Sample_dg;
close sample_dg;

finish;
run;

/*************************************************************************
* 		PART 2:  Bootstrapping Loop
**************************************************************************/
* input:  data set containing Y, data set containing X;
data depvar;
 set original;
 keep &dep;
run;

data indepvar;
 set original;
 keep &indep;
run;


* steps:  1. resample, 2. compute r-squared vector, 3. repeat B times;
option ls=70 nonotes nosource nosource2;
proc IML;
reset noprint; * nolog;
start;

*A. Generate a vector of B seeds (seedvec);
/***********************************************************************/
/* The vector seedvec contains B random seeds from a uniform           */
/* distribution, generated using the initial value (seed) specified    */
/* above.  The vector is then saved to an external file, seeds,        */
/* which is subsequently read into the SAS dataset (seeds).            */
/***********************************************************************/ 

seedvec=J(&B,1,0);
 do i=1 to &B;
   seedvec(|i,|) = uniform(&seed)*100000;
 end;

create seeds from seedvec;
 append from seedvec;
close seeds;

finish;
run; quit;


/*====================================================================*/
/*     start 1 to B loop                      			      */
/*====================================================================*/

%do r = 1 %to &B;

proc IML;
reset noprint; * nolog;
start;
* obtain Aij and Dij for each ij pair;

* read in the original data set (into DATA);
 use depvar;
  read all into YData;
 close depvar;
 use indepvar;
  read all into XData;
 close indepvar;
DATA = YData || XData;
Y = YData;
X = XData;

* read in the seeds vector (into seedvec);
use seeds;
 read all into seedvec;
close seeds;

 n=nrow(DATA);
 r=ncol(DATA);

*** check input;
if &r=1 then do;
 print 'Check that the data were read properly!!!';
 first5Y = J(1,5,0); first5Y=Y[1:5,];
 first5X = J(&p,5,0); first5X=X[1:5,];
 print 'The number of observations used is:' n;
 print 'The total number of variables used is:' r;
 print 'The dependent values for the first five cases are:' first5Y;
 print 'The predictor values for the first five cases are:' first5X;
 print 'You requested' &B 'bootstrap samples';
 print 'The bootstrap procedure is now running... PLEASE WAIT...';
end;


/***********************************************************************/
/* For bootstrapping run i, the i-th entry of the vector seedvec       */
/* (resample) is used to generate a vector (integers) whose entries    */
/* consist of the integers 1 to n sampled randomly, with replacement,  */
/* from a uniform distribution.                                        */
/***********************************************************************/      
 integers=J(n,1,0);
 
 * Specify seed for resampling (resample);
 resample=J(1,1,0);
 resample=seedvec(|&r,|);
 
 * generate a vector (integers) containing the integers 1 to n sampled
 randomly, with replacement, from a uniform distribution whose initial seed is
 resample;
 Do i=1 to n;
  integers(|i|)=int(uniform(resample)*(n-1)+1); 
 end; 
*print &r integers;

%if &predtype='r' %then %do;  
/*=====================================================================*/
/***                 PAIR RESAMPLING BOOTSTRAP                       ***/
/*=====================================================================*/
/***********************************************************************/
/* The i-th entry in integers is an integer (j) between 1 and n which  */
/* indicates the entry (row) number of the data matrix to be taken     */
/* as the i-th entry (row) in the matrix obsboot.  This is repeated    */
/* for i = 1 to n.  						       */
/***********************************************************************/    
 * obsboot is a matrix of randomly sampled observations;
 obsboot=J(n,r,0);
 Do i=1 to n;
  j=integers(|i, |);
  obsboot(|i, |)=DATA(|j, |); 
 end;

 * convert bootstrapped data to external file;
 ***************************************************;
create bootdata from obsboot[colname={&dep &indep}];
 append from obsboot;
close bootdata;


%end;
/*===================== END of Pair Resampling =======================*/



%if &predtype='f' %then %do; 
/*=====================================================================*/
/***                RESIDUAL RESAMPLING BOOTSTRAP                    ***/
/*=====================================================================*/
/***********************************************************************/
/* From the matrix DATA, whose first column represents Y and the       */
/* remaining columns represent X (as specified above), the vector of   */
/* Least-squares predicted values, YHat, and the vector of the         */
/* Least Squares residuals, Res, are computed.                         */
/***********************************************************************/

* fit a model with an intercept to obtain YHat and Res;
 ONE=J(n,1,1);
 X1=ONE||X;
 YHat=X1*INV(X1`*X1)*X1`*Y;
 Res=YHat-Y;


/***********************************************************************/
/* The i-th entry in integers is an integer between 1 and n (j) which  */
/* indicates the entry (row) number of the residuals vector, Res,      */
/* to be added to the i-th predicted value, Y-Hat.  This is repeated   */
/* for i = 1 to n.  The vector Yboot consists of the initial predicted */
/* values (YHat) with the randomly selected residuals added.  The      */
/* matrix Fixdata consists of Yboot and the original X matrix.         */
/***********************************************************************/      

 * Resboot is a vector of randomly sampled residuals;
 Resboot=J(n,1,0);
 Do i=1 to n;
  j=integers(|i, |);
  Resboot(|i, |)=Res(|j, |); 
 end;

 * add random residuals to Y-hat;
 Yboot=YHat+Resboot; 

 * an n by r matrix of bootstrapped Y and fixed Xs;
 Fixdata=Yboot||X;       

 * convert bootstrapped data to external file;
 ***************************************************;
create bootdata from Fixdata[colname={&dep &indep}];
 append from Fixdata;
close bootdata;

%end;
/*=================== END of Residual Resampling =====================*/

finish; 
run; quit;


* produce bootstrap data sets;


/* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
/* 		Computing Dij		               */
/* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/

* regression of bootstrapped data;
 ***********************************;

data resample;                             
set bootdata;
run;

proc reg corr data=resample noprint outest=regfile (keep=_in_ _rsq_ &indep);
  model &dep=&indep / selection=adjrsq;
run; quit; option nonotes;

* order all subset models in lexicographical order;
 data bootmat;
  set regfile;
  *%if %substr(&sysver,1,1)>7 %then if _n_>1;;
  %do i=1 %to &p;
    &&x&i=(&&x&i>.); 
  %end;
  if _IN_=. then delete;
  keep &indep _IN_ _RSQ_;
 run;
proc sort; by _IN_ %do i=1 %to &p; descending &&x&i %end;; run;


* obtain Aij and Dij for each ij pair;
proc IML;
reset noprint; * nolog;
start;
/*************************************************************************
* 		COMPUTE DOMINANCE MATRICES (resample)
**************************************************************************/
use bootmat;
 read all into damat;
close bootmat;
* damat is a matrix that consists of p+2 columns: 1/0 for x1 - xp, in, Rsq;

use Dcsample;
 read all into sampleDc;
close Dcsample;
use Dasample;
 read all into sampleDa;
close Dasample;
use Dgsample;
 read all into sampleDg;
close Dgsample;

* at first run, initialize frequency matrices and sample pattern counter;
if &r =1 then do;
 freqDc1=J(&p, &p, 0); freqDc0=J(&p, &p, 0); freqDcn=J(&p, &p, 0); 
 freqDa1=J(&p, &p, 0); freqDa0=J(&p, &p, 0); freqDan=J(&p, &p, 0);
 freqDg1=J(&p, &p, 0); freqDg0=J(&p, &p, 0); freqDgn=J(&p, &p, 0);
 patcount=J(1,3,0); *columns correspond to dominance types;
end;

* at subsequent runs retrieve files as sum matrices;
if &r ^= 1 then do;
use sumDc1;
 read all into freqDc1;
close sumDc1;
use sumDc0;
 read all into freqDc0;
close sumDc0;
use sumDcn;
 read all into freqDcn;
close sumDcn;

use sumDa1;
 read all into freqDa1;
close sumDa1;
use sumDa0;
 read all into freqDa0;
close sumDa0;
use sumDan;
 read all into freqDan;
close sumDan;

use sumDg1;
 read all into freqDg1;
close sumDg1;
use sumDg0;
 read all into freqDg0;
close sumDg0;
use sumDgn;
 read all into freqDgn;
close sumDgn;

use patfreq;
 read all into patcount;
close patfreq;

end;

* r-squared vector is the last column of damat (without null model);
RSQ=damat[,ncol(damat)];
*print &r RSQ;

Dc=J(&p,&p,0); Da=J(&p,&p,0); Dg=J(&p,&p,0);



/***************************************************/
/*****         Complete dominance		****/
/***************************************************/
do i=1 to &p-1;
 do j=i+1 to &p;

 * define matrix of constants ("comp") to determine complete dominance between
 each pair of predictors;

        comp=J(2**(&p-2),2**&p-1,0);

        * Xh is any subset model that does not include i and j;
        * Xh contains the columns of damat that do not involve i or j;
        Xh=J(nrow(damat),&p-2,99);
		Xhcol=0;			* index column number in Xh;
		do h=1 to &p;
		 if (i ^= h & j ^= h) then do;	* find non i,j columns;
		  Xhcol=Xhcol+1;				* update column number in Xh;
		  Xh[,Xhcol] = damat[,h];		* assign column to Xh;
		 end;
		end;

		* contrast rows (subsets) representing XiXh and XjXh;
        comprow=1;
        do r=1 to 2**&p-2;               * for each pair of rows;
         do s=r+1 to 2**&p-1;
          if Xh[r,]=Xh[s,] then do;     * if the rows of Xh are the same;
                                        * and if i and j are contrasted;
           if (damat[r,i]=1 & damat[r,j]=0 & damat[s,i]=0 &
                damat[s,j]=1) then do;
                 comp[comprow,r]=1;
                 comp[comprow,s]=-1;
                 comprow=comprow+1;
           end;
          end;

         end;   * do r loop;
        end;    * do s loop;


*** Determine the complete Dij value ***;

  cdiffij=comp*RSQ;

  zero=J(nrow(cdiffij), ncol(cdiffij),0);
  ijdom=nrow(cdiffij);

  * obtain complete dominance matrices;
     * undetermined case (all differences are zero);
     if cdiffij=zero then do;
        Dc[i,j]=0.5;
        Dc[j,i]=Dc[i,j];
     end;
     * else, check signs of difference elements;
     if cdiffij ^= zero then do;
      nonneg=0; nonpos=0;
      do k=1 to nrow(cdiffij);
       if cdiffij[k,]>=0 then nonneg=nonneg+1;
       if cdiffij[k,]<=0 then nonpos=nonpos+1;
      end;
      * dominance case;
      if nonneg=ijdom then Dc[i,j]=1; else Dc[i,j]=0;
      if nonpos=ijdom then Dc[j,i]=1; else Dc[j,i]=0;
      * undetermined case (differences have different signs);
      if (Dc[i,j]=0 & Dc[j,i]=0) then do;
        Dc[i,j]=0.5;
        Dc[j,i]=Dc[i,j];
      end;
     end;



/***************************************************/
/*****         Conditional dominance            ****/
/***************************************************/
* define matrix of constants ("avg") to determine average (within model
size) dominance between each pair of predictors;

subsets=damat;
* subsets is a matrix that consists of p+2 columns (and 
the null model in the top row): 1/0 for x1 - xp, in, Rsq;

  avg=J(&p,2**&p-1,0);
  do c=1 to &p;         * for each model size;
   num=0;               * index number of subsets per size;
   do m=1 to 2**&p-1;	* for each model, determine number of models
   						of size is c-1 to which the ith predictor makes
   						an additional contribution;
	if (subsets[m,&p+1]=c-1 & subsets[m,i]=0) then num=num+1;
   end;
   if num=0 then num=1;	* for the case of the null model (size=0);
   do r=1 to 2**&p-1;    * determine subsets to contrast;
    * consider additional contributions for model size (c);
    if (subsets[r,&p+1]=c-1 | subsets[r,&p+1]=c) then do;
     * average contribution of models that inlcude i but not j;
     if (subsets[r,i]=1 & subsets[r,j]=0) then avg[c,r]=1/num;
     * average contribution of models that inlcude j but not i;
     if (subsets[r,i]=0 & subsets[r,j]=1) then avg[c,r]=-1/num;
    end;
   end;
  end;

*** Determine the average Dij value ***;

  adiffij=avg*RSQ;

  zero=J(nrow(adiffij), ncol(adiffij),0);
  ijdom=nrow(adiffij);

  * obtain average dominance matrices;
     * undetermined case (all differences are zero);
     if adiffij=zero then do;
        Da[i,j]=0.5;
        Da[j,i]=Da[i,j];
     end;
     * else, check signs of difference elements;
     if adiffij ^= zero then do;
      nonneg=0; nonpos=0;
      do k=1 to nrow(adiffij);
       if adiffij[k,]>=0 then nonneg=nonneg+1;
       if adiffij[k,]<=0 then nonpos=nonpos+1;
      end;
      * dominance case;
      if nonneg=ijdom then Da[i,j]=1; else Da[i,j]=0;
      if nonpos=ijdom then Da[j,i]=1; else Da[j,i]=0;
      * undetermined case (differences have different signs);
      if (Da[i,j]=0 & Da[j,i]=0) then do;
        Da[i,j]=0.5;
        Da[j,i]=Da[i,j];
      end;
     end;


/***************************************************/
/*****         General dominance                ****/
/***************************************************/
* define matrix of constants ("glob") to determine global (overall average)
dominance between each pair of predictors;

  glob=J(1,2**&p-1,99);
  do g=1 to 2**&p-1;
   glob[,g]=avg[+,g]/&p;
  end;

*** Determine the global Dij value ***;

  gdiffij=glob*RSQ;
  *print 'global dominance comparisons';
  *print i j gdiffij;

  zero=J(nrow(gdiffij), ncol(gdiffij),0);
  ijdom=nrow(gdiffij);

  * obtain global dominance matrices;
     * undetermined case (all differences are zero);
     if gdiffij=zero then do;
        Dg[i,j]=0.5;
        Dg[j,i]=Dg[i,j];
     end;
     * else, check signs of difference elements;
     if gdiffij ^= zero then do;
      nonneg=0; nonpos=0;
      do k=1 to nrow(gdiffij);
       if gdiffij[k,]>=0 then nonneg=nonneg+1;
       if gdiffij[k,]<=0 then nonpos=nonpos+1;
      end;
      * dominance case;
      if nonneg=ijdom then Dg[i,j]=1; else Dg[i,j]=0;
      if nonpos=ijdom then Dg[j,i]=1; else Dg[j,i]=0;
      * undetermined case (differences have different signs);
      if (Dg[i,j]=0 & Dg[j,i]=0) then do;
        Dg[i,j]=0.5;
        Dg[j,i]=Dg[i,j];
      end;
     end;


end; *(i loop);
end; *(j loop);



**** compare current D matrices to sample D matrices ****;
if Dc=sampleDc then patcount[,1]=patcount[,1]+1;
if Da=sampleDa then patcount[,2]=patcount[,2]+1;
if Dg=sampleDg then patcount[,3]=patcount[,3]+1;
create patfreq from patcount;
 append from patcount;
close patfreq;


thisDc1=J(&p,&p,0); thisDc0=J(&p,&p,0); thisDcn=J(&p,&p,0); 
thisDa1=J(&p,&p,0); thisDa0=J(&p,&p,0); thisDan=J(&p,&p,0); 
thisDg1=J(&p,&p,0); thisDg0=J(&p,&p,0); thisDgn=J(&p,&p,0); 
do i=1 to &p;
 do j=1 to &p;
  if i ^=j then do;
   if Dc[i,j]=1 then thisDc1[i,j]=1;
   if Dc[i,j]=0 then thisDc0[i,j]=1;
   if Dc[i,j]=0.5 then thisDcn[i,j]=1;
   if Da[i,j]=1 then thisDa1[i,j]=1;
   if Da[i,j]=0 then thisDa0[i,j]=1;
   if Da[i,j]=0.5 then thisDan[i,j]=1;
   if Dg[i,j]=1 then thisDg1[i,j]=1;
   if Dg[i,j]=0 then thisDg0[i,j]=1;
   if Dg[i,j]=0.5 then thisDgn[i,j]=1;
  end;
 end;
end;

**** add current D matrices to previous D matrices ****;
freqDc1 = freqDc1+thisDc1; 
freqDa1 = freqDa1+thisDa1; freqDg1 = freqDg1+thisDg1;
freqDc0 = freqDc0+thisDc0; 
freqDa0 = freqDa0+thisDa0; freqDg0 = freqDg0+thisDg0;
freqDcn = freqDcn+thisDcn; 
freqDan = freqDan+thisDan; freqDgn = freqDgn+thisDgn;

create sumDc1 from freqDc1;
 append from freqDc1;
close sumDc1;
create sumDc0 from freqDc0;
 append from freqDc0;
close sumDc0;
create sumDcn from freqDcn;
 append from freqDcn;
close sumDcn;

create sumDa1 from freqDa1;
 append from freqDa1;
close sumDa1;
create sumDa0 from freqDa0;
 append from freqDa0;
close sumDa0;
create sumDan from freqDan;
 append from freqDan;
close sumDan;

create sumDg1 from freqDg1;
 append from freqDg1;
close sumDg1;
create sumDg0 from freqDg0;
 append from freqDg0;
close sumDg0;
create sumDgn from freqDgn;
 append from freqDgn;
close sumDgn;

*at last run create summary tables containing the following columns:
i, j, f=1, f=0, f=0.5 pij pji;
if &r=&B then do;
 row=1;
 summary_c=J(&p*(&p-1), 8, -9);
 summary_a=J(&p*(&p-1), 8, -9);
 summary_g=J(&p*(&p-1), 8, -9);
 do i=1 to &p;
  do j=1 to &p;
   if i^=j then do;
    summary_c[row,1]=i; summary_c[row,2]=j; 
	summary_c[row,3]=freqDc1[i,j]; 
	summary_c[row,4]=freqDc0[i,j]; summary_c[row,5]=freqDcn[i,j]; 
	summary_c[row,6]=freqDc1[i,j]/&B; summary_c[row,7]=freqDc0[i,j]/&B; 
    summary_c[row,8]=freqDcn[i,j]/&B;   
    summary_a[row,1]=i; summary_a[row,2]=j; 
	summary_a[row,3]=freqDa1[i,j]; 
	summary_a[row,4]=freqDa0[i,j]; summary_a[row,5]=freqDan[i,j];  
	summary_a[row,6]=freqDa1[i,j]/&B; summary_a[row,7]=freqDa0[i,j]/&B;  
    summary_a[row,8]=freqDan[i,j]/&B;   
    summary_g[row,1]=i; summary_g[row,2]=j; 
	summary_g[row,3]=freqDg1[i,j]; 
	summary_g[row,4]=freqDg0[i,j]; summary_g[row,5]=freqDgn[i,j]; 
	summary_g[row,6]=freqDg1[i,j]/&B; summary_g[row,7]=freqDg0[i,j]/&B;  
    summary_g[row,8]=freqDgn[i,j]/&B;   
    row=row+1;
   end;
  end;
 end;
 create csummary from summary_c[colname={i j f1 f0 fn Pij Pji pno}];
  append from summary_c;
 close csummary;
 create asummary from summary_a[colname={i j f1 f0 fn Pij Pji pno}];
  append from summary_a;
 close asummary;
 create gsummary from summary_g[colname={i j f1 f0 fn Pij Pji pno}];
  append from summary_g;
 close gsummary;
 create patsum from patcount[colname={complete conditional general}];
  append from patcount;
 close patsum;
end;

finish;
run; quit;

%end; ***** end the 1 to B loop ****;


/*************************************************************************/
/* 		PART 3:  Probabilities and reproducibilities					 */
/*************************************************************************/
* input:  nine p by p matrices of COUNTS;
* stpes:  convert to probabilities, compute test statistics / CIs, plots;
* output:  tables;
data average;
 set asummary;
 dominance='conditional';
 pair=i*10+j;
run; 
data complete;
 set csummary;
 dominance='complete';
 pair=i*10+j;
run; 
data global;
 set gsummary;
 dominance='general';
 pair=i*10+j;
run; 

data merged;
 set complete average global;
run;
/* proc print data=merged; title 'results'; run; */
data results; set merged;
 array f[3] f1 f0 fn;
  do domtype=1 to 3;
   domfreq = f[domtype];
   output;
  end;
run;
data results; set results;
  if domtype=1 then Dij=1;
  if domtype=2 then Dij=0;
  if domtype=3 then Dij=0.5;
run;
proc sort data=results; by dominance pair; run;
proc means data=results noprint;
 var Dij; freq domfreq;
 by dominance pair;
 output out=meansout mean=Dbar STD=SE;
run;



data samplea;
 set Sample_da; pair=i*10+j; dominance='conditional';
run;
data samplec;
 set Sample_dc; pair=i*10+j; dominance='complete';
run;
data sampleg;
 set Sample_dg; pair=i*10+j; dominance='general';
run;
data sample;
 set samplec samplea sampleg;
run;


******* SUMMARY OUTPUT *******;
* variables: dominance-type, i, j, p(iDj), p(jDi), p(noD), Dbar, SE, 
sample-value, pattern-prob;
*** merge sample, results and meansout data sets by dominance and pair;
proc sort data=sample; by dominance pair; run;
data results; set results; if domtype=1; run;
proc sort data=results; by dominance pair; run;
proc sort data=meansout; by dominance pair; run;
data printout;
 merge results sample meansout;
  by dominance pair;
run;

data table;
 set printout(drop=domtype domfreq _TYPE_ _FREQ_);
/* i_dom_j=pij; j_dom_i=pji; no_dom=pno; */
 Pijno=pno;
 Dij_mean = dbar;
 Dij_se = se;
 if dij=1 then reprod = pij;
 if dij=0 then reprod = pji;
 if dij=0.5 then reprod = pijno;
 format Dij_se F6.3;
run;
data table; set table(drop = f1 f0 fn pno dbar); run;
proc sort; by dominance pair; run;
proc print data=table; 
 title 'summary of results: all pairs'; 
 title2 'Pij, Pji, Pijno are dominance probabilities and';
 title3 'reprod are reproducibility values';
 var dominance i j Dij Dij_mean dij_SE Pij Pji Pijno reprod;
run;
data plots; set table;
 if Pij ge Pji;
 format Dij_se F6.3;
run;
proc print data=plots; 
 title 'summary of results: pair arranged by dominant predictor'; 
 title2 'Pij, Pji, Pijno are dominance probabilities and';
 title3 'reprod are reproducibility values';
 var dominance i j Dij Dij_mean dij_SE Pij Pji Pijno reprod;
run;

%mend dom;



/***********************************************************
   The following macros expand a variable list containing
   hyphens into a list specifying each individual variable. 
***********************************************************/

%MACRO EXPAND;
  %LET LNGTH = %LENGTH(&INDEP);  %LET TEMP=;
  %DO _INDEX_ = 1 %TO &LNGTH;
    %LET ITEM = %SCAN(&INDEP,&_INDEX_,%QUOTE( ));
    %IF %LENGTH(&ITEM) EQ 0 %THEN %GOTO DONE;
    %IF %INDEX(&ITEM,-) > 0 %THEN %EXPANDED;
    %LET TEMP = &TEMP &ITEM;
  %END;  %DONE:  %LET INDEP = &TEMP;
%MEND EXPAND;

%MACRO EXPANDED;
  %LET DASH = %INDEX(&ITEM,-);
  %DO I = %EVAL(&DASH-1) %TO 1 %BY -1;
    %LET ALPHANUM = %SUBSTR(&ITEM,&I,1);
    %DO II = 0 %TO 9;  %IF &ALPHANUM EQ &II %THEN %GOTO FOUND;  %END;
    %GOTO DONE;
  %FOUND:  %END;
  %DONE:  %LET PREFX = %SUBSTR(&ITEM,1,&I);
  %LET LOWER = %SUBSTR(&ITEM,%EVAL(&I+1),%EVAL(&DASH-&I-1));
  %LET UPPER = %SUBSTR(&ITEM,%EVAL(&DASH+&I+1));
  %LET ITEM=; %DO II = &LOWER %TO &UPPER; %LET ITEM = &ITEM &PREFX.&II;
  %END;
%MEND EXPANDED;