* Input raw data (with missing values -99) from an external file; data raw; filename data 'Z:\zzy\research\Robust SEM\Missing data with Ke-Hai\mardiaMV25.dat.txt'; *need to be modified; *filename data 'd:\missingdata-robustSEM\mardiaMV25_contaminated.dat'; *need to be modified; infile data; input v1 v2 v3 v4 v5; *need to be modified; run; *-----------------------------------------------------------------------*; *-----------------------------------------------------------------------*; ** Missing data are on the last variable; ** The first stage is Huber-weight using an extended EM-algorithm; ** The second stage analysis is to minimizing the normal-distribution-based discrepancy function F_ML; ** Rescaled or adjusted statistics is used for inference; *the asymptotic cov is given by sandwich type matrix; ** as well as the inverse of the information matrix; options ls=150; *options nodate nonumber; title; proc iml; reset noname; *-----------------------------------------------------------------------*; *misinfo=j(m,p+2,0), m is the number of missing patterns; *misinfo[i,1]=number of cases in pattern i; *misinfo[i,2]=number of observed variables in pattern i; *misinfo[i,3:p+2] contains a permutation of {1,2,3,...,p}, the first part corresponding to observed variables, the remaining corresponding to missing variables in pattern i; start pattern(n,p,x,misinfo); *-------------------------------------------------------------------------; *use 2^j to order missing data patterns by SAS sort subroutine; misorder=j(n,1,0); do i=1 to n; misorderj=0; do j=1 to p; xij=x[i,j]; if xij=-99 then do; misorderj=misorderj+2**(j-1);*this gives each missing pattern a unique number; end; end; misorder[i]=misorderj; end; x=x||misorder; call sort( x, p+1);*sort the rows of x according to the (p+1)th column from small to large; *-------------------------------------------------------------------------; print x; misn=x[,p+1]; x=x[,1:p]; *--------------------------------------------------------------------------; *identifying the subscripts of missing variables and put them in misinfo; mi=0; nmi=0; oi=0; noi=0; do j=1 to p; if x[1,j]=-99 then do; mi=mi||j; *recording the missing variable subscript in the first case; nmi=nmi+1; *number of missing values in the first case; end; else do; oi=oi||j;*recording the observed variable subscript in the first case; noi=noi+1; end; end; oi=(oi[2:noi+1])`; if nmi=0 then do; misinfo_0=noi||oi; end; else do; mi=(mi[2:nmi+1])`; misinfo_0=noi||oi||mi; *recording the # observed variables, locations; end; *--------------------------------------------------------------------------; patnobs=0; *# of observed cases in a pattern; totpat=1; ncount=1; t1=misn[1]; do i=2 to n; if misn[i]=t1 then do; ncount=ncount+1; end; else do; patnobs=patnobs//ncount; t1=misn[i]; ncount=1; totpat=totpat+1; mi=0; nmi=0; oi=0; noi=0; do j=1 to p; if x[i,j]=-99 then do; mi=mi||j; nmi=nmi+1; end; else do; oi=oi||j; noi=noi+1; end; end; oi=(oi[2:noi+1])`; mi=(mi[2:nmi+1])`; misinfo_0=misinfo_0//(noi||oi||mi); end; end; patnobs=patnobs//ncount; patnobs=patnobs[2:totpat+1]; misinfo=patnobs||misinfo_0; print misinfo; finish; *-----------------------------------------------------------------------*; ** EM algorithm for unstructured (mu,sigma) starts here; start emmusig(x,misinfo,varphi,err,mu1,sig1); ep=.000000000001; n=nrow(x); p=ncol(x); mu0=j(p,1,0);*starting value; sig0=i(p);*starting value; n_it=0; **control iterations within 200; err=0; dt=1; prob=1-varphi; chip=cinv(prob,p); ck=sqrt(chip); cbeta=( p*probchi(chip,p+2)+ chip*(1-prob) )/p; print cbeta; do until (dt200 then do; err=1; goto label2; end; end; print mu1; print sig1; label2: finish; *-----------------------------------------------------------------------*; **** The following subroutine is to perform vech(.) function; *-----------------------------------------------------------------------*; start vech(A,Va); l=0; p=nrow(A); pstar=p*(p+1)/2; Va=j(pstar,1,0); do i=1 to p by 1; do j=i to p by 1; l=l+1; Va(|l|)=A(|j,i|); end; end; finish; ***--------------------------------------------------------------------***; **** The following subroutine is to perform vec(.) function; ***--------------------------------------------------------------------***; start vec(A,Veca); l=0; p=nrow(A); p2=p*p; Veca=j(p2,1,0); do j=1 to p; Veca[(j-1)*p+1:j*p]=A[,j]; end; finish; ***--------------------------------------------------------------------***; ** creates duplication matrix as defined in Magnus&Neudecker; ***--------------------------------------------------------------------***; start DP(p, dup); Dup=j(p*p,p*(p+1)/2,0); count=0; do j=1 to p; do i=j to p; count=count+1; if i=j then do; Dup[(j-1)*p+j, count]=1; end; else do; Dup[(j-1)*p+i, count]=1; Dup[(i-1)*p+j, count]=1; end; end; end; finish; *-----------------------------------------------------------------------*; *-----------------------------------------------------------------------*; **creating index for vec(Sigma_j) corresponding to the observed cases; *-----------------------------------------------------------------------*; start index(p,oj,indexj); index=j(p,p,0); count=0; do i=1 to p; do j=1 to p; count=count+1; index[j,i]=count; end; end; indexoj=index[oj,oj]; nj=nrow(indexoj); vecj=0; do i=1 to nj; vecj=vecj//indexoj[,i]; end; indexj=vecj[2:nj*nj+1]; finish; *-----------------------------------------------------------------------*; **computing the estimator of the asymptotic covariance of \hat\Omega_{\hat\beta}; *-----------------------------------------------------------------------*; start Ascov(varphi,mu0,sig0,x,misinfo, Abeta, Bbeta, Gamma); n=nrow(x); p=ncol(x); ps=p*(p+1)/2; pps=p+ps; run dp(p,dup); dupt=dup`; i_p=i(p); B11=j(p,p,0); B12=j(p,ps,0); B22=j(ps,ps,0); ddl11=j(p,p,0); ddl12=j(p,ps,0); ddl21=j(ps,p,0); ddl22=j(ps,ps,0); prob=1-varphi; chip=cinv(prob,p); ck=sqrt(chip); cbeta=( p*probchi(chip,p+2)+ chip*(1-prob) )/p; dl=j(pps,1,0); npat=nrow(misinfo); n1=misinfo[1,1]; p1=misinfo[1,2]; if p1=p then do;*complete data; sigin=inv(sig0); run vec(sig0,vecsig); Wmat=0.5*(sigin@sigin); do i=1 to n1; xi=x[i,]`; xi0=xi-mu0; stdxi=sigin*xi0; stdxit=stdxi`; di2=xi0`*stdxi; di=sqrt(di2); **Huber weight functions; if di<=ck then do; wi1=1.0; wi2=1.0/cbeta; dwi1=0; dwi2=0; end; else do; wi1=ck/di; wi2=wi1*wi1/cbeta; dwi1=wi1/di2; dwi2=wi2/di2; end; *for computing B_{\beta}; dlimu=wi1*stdxi; xixi0=xi0*xi0`; run vec(xixi0,vecyi); wvecyi=wi2*vecyi; dlisig=dupt*Wmat*(wvecyi-vecsig); B11=B11+dlimu*dlimu`; B12=B12+dlimu*dlisig`; B22=B22+dlisig*dlisig`; dl_i=dlimu//dlisig; dl=dl+dl_i; *for computing A_{\beta}; Hi=stdxi*stdxit; tti=wi1*sigin; uui=wi2*sigin; ddl11=ddl11+(-tti+dwi1*Hi); ddl22=ddl22+dupt*(Wmat-( Hi@(uui-.5*dwi2*Hi) ) )*dup; ddl12=ddl12+((-tti+.5*dwi1*Hi)@stdxit )*dup; ddl21=ddl21+dupt*((-uui+dwi2*Hi)@stdxi ); end; *repeat the n1 cases; end; *if p1=p; else do; chip1=cinv(prob,p1); ck1=sqrt(chip1); cbeta1=( p1*probchi(chip1,p1+2)+ chip1*(1-prob) )/p1; o1=misinfo[1,3:(2+p1)]; mu_o=mu0[o1]; sig_oo=sig0[o1,o1]; run vec(sig_oo,vecsig_oo); sigin_oo=inv(sig_oo); E1=i_p[o1,]; Et1=E1`; F1=(E1@E1)*dup; Ft1=F1`; Wmat1=0.5*(sigin_oo@sigin_oo); do i= 1 to n1; xi=x[i,]`; xi_o=xi[o1]; xi0_o=xi_o-mu_o; xi0_ot=xi0_o`; stdxi_o=sigin_oo*xi0_o; stdxit_o=stdxi_o`; di2=xi0_ot*stdxi_o; di=sqrt(di2); **Huber weight functions; if di<=ck1 then do; wi1=1.0; wi2=1.0/cbeta1; dwi1=0; dwi2=0; end; else do; wi1=ck1/di; wi2=wi1*wi1/cbeta1; dwi1=wi1/di2; dwi2=wi2/di2; end; *for computing B_{\beta}; dlimu=wi1*Et1*stdxi_o; xixi0_o=xi0_o*xi0_ot; run vec(xixi0_o,vecyi); wvecyi=wi2*vecyi; dlisig=Ft1*Wmat1*(wvecyi-vecsig_oo); B11=B11+dlimu*dlimu`; B12=B12+dlimu*dlisig`; B22=B22+dlisig*dlisig`; dl_i=dlimu//dlisig; dl=dl+dl_i; *for computing A_{\beta}; Hi=stdxi_o*stdxit_o; *tti=wi1*sigooin; tti=wi1*sigin_oo; *uui=wi2*sigooin; uui=wi2*sigin_oo; ddl11=ddl11+Et1*(-tti+dwi1*Hi)*E1; ddl22=ddl22+Ft1*(Wmat1-( Hi@(uui-.5*dwi2*Hi) ) )*F1; *Wmati to Wmat1; ddl12=ddl12+Et1*((-tti+.5*dwi1*Hi)@stdxit_o )*F1; ddl21=ddl21+Ft1*((-uui+dwi2*Hi)@stdxi_o )*E1; end; *1 to n1; end; *elsedo; *start from pattern 2; snj=n1; do j=2 to npat; nj=misinfo[j,1]; pj=misinfo[j,2]; chipj=cinv(prob,pj); ckj=sqrt(chipj); cbetaj=( pj*probchi(chipj,pj+2)+ chipj*(1-prob) )/pj; oj=misinfo[j,3:(2+pj)]; mu_o=mu0[oj]; sig_oo=sig0[oj,oj]; sigin_oo=inv(sig_oo); run vec(sig_oo,vecsig_oo); sigin_oo=inv(sig_oo); Ej=i_p[oj,]; Etj=Ej`; Fj=(Ej@Ej)*dup; Ftj=Fj`; Wmati=0.5*(sigin_oo@sigin_oo); do i=snj+1 to snj+nj; xi=x[i,]`; xi_o=xi[oj]; xi0_o=xi_o-mu_o; xi0_ot=xi0_o`; stdxi_o=sigin_oo*xi0_o; stdxit_o=stdxi_o`; di2=xi0_ot*stdxi_o; di=sqrt(di2); **Huber weight functions; if di<=ckj then do; wi1=1.0; wi2=1.0/cbetaj; dwi1=0; dwi2=0; end; else do; wi1=ckj/di; wi2=wi1*wi1/cbetaj; dwi1=wi1/di2; dwi2=wi2/di2; end; *for computing B_{\beta}; dlimu=wi1*Etj*stdxi_o; xixi0_o=xi0_o*xi0_ot; run vec(xixi0_o,vecyi); wvecyi=wi2*vecyi; dlisig=Ftj*Wmati*(wvecyi-vecsig_oo); B11=B11+dlimu*dlimu`; B12=B12+dlimu*dlisig`; B22=B22+dlisig*dlisig`; dl_i=dlimu//dlisig; dl=dl+dl_i; *for computing A_{\beta}; Hi=stdxi_o*stdxit_o; tti=wi1*sigin_oo; uui=wi2*sigin_oo; ddl11=ddl11+Etj*(-tti+dwi1*Hi)*Ej; ddl22=ddl22+Ftj*(Wmati-( Hi@(uui-.5*dwi2*Hi) ) )*Fj; ddl12=ddl12+Etj*(( (-tti+0.5*dwi1*Hi)@stdxit_o ) )*Fj; ddl21=ddl21+Ftj*((-uui+dwi2*Hi)@stdxi_o )*Ej; end; *repeat cases within jth pattern; snj=snj+nj; end; *repeat pattern; Bbeta=(B11||B12)//(B12`||B22); Abeta=(ddl11||ddl12)//(ddl21||ddl22); Abin=inv(Abeta); Omega=n*Abin*Bbeta*(Abin`); *Gamma=Omega[p+1:pps,p+1:pps]; gamma=Omega; finish; *-----------------------------------------------------------------------*; *-----------------------------------------------------------------------*; **creating index for vech(Sigma) corresponding to the selected variables; *p=#all variables; *ps=# selected variables; *V_forana contains the selected subscripts; start indexv(p,V_forana,index_s); pv=nrow(V_forana); pvs=pv*(pv+1)/2; index_s=j(pvs,1,0); count=p; countv=0; do i=1 to p; do j=i to p; count=count+1; do iv=1 to pv; do jv=iv to pv; if i=V_forana[iv] then do; if j=V_forana[jv] then do; countv=countv+1; index_s[countv]=count; end; end; end; end; end; end; index_s=V_forana//index_s; finish; *-----------------------------------------------------------------------*; *-----------------------------------------------------------------------*; **creating index for vech(Sigma) corresponding to the selected variables; *p=#all variables; *ps=# selected variables; *V_forana contains the selected subscripts; start indexvc(p,V_forana,index_s); pv=nrow(V_forana); pvs=pv*(pv+1)/2; index_s=j(pvs,1,0); count=0; countv=0; do i=1 to p; do j=i to p; count=count+1; do iv=1 to pv; do jv=iv to pv; if i=V_forana[iv] then do; if j=V_forana[jv] then do; countv=countv+1; index_s[countv]=count; end; end; end; end; end; end; index_s=index_s+j(pvs,1,p) ; finish; *-----------------------------------------------------------------------*; **generating a permutation matrix from the order of vech(A) to the vecs(A) as used by EQS; ***--------------------------------------------------------------------***; start switch(p, permuc, permu); ps=p*(p+1)/2; bmat=j(p,p,0); nb=0; do i=1 to p; do j=1 to i; nb=nb+1; bmat[i,j]=nb; end; end; run vech(bmat,vb); Imatc=i(ps); permuc=Imatc[,vb]; Imat=i(p+ps); vp=1:p; vs=vp`//( vb+j(ps,1,p) ); permu=imat[,vs]; permu=permu[(p+1):(p+ps),]//permu[1:p,]; *EQS puts the covariance first; finish; *-----------------------------------------------------------------------*; *** The following is the main program; use raw; read all var _num_ into x; close raw; n=nrow(x); print "n=" n; p=ncol(x); V_forana={1, 2, 4, 5}; *need to be specified; *V_forana={1, 2,3, 4, 5,6,7,8,9}; *need to be specified; varphi=0.10; *need to be specified; print "varphi=" varphi; p_v=nrow(V_forana); print p_v; pvs=p_v+p_v*(p_v+1)/2; run pattern(n,p,x,misinfo); print x; totpat=nrow(misinfo); print "#total observed patterns=" totpat; print "cases--#observed V--observed V--missing V="; print misinfo; run emmusig(x,misinfo,varphi,err,hmu1,hsigma1); hmu=hmu1[V_forana]`; hsigma=hsigma1[V_forana,V_forana]; print "hat\mu="; print hmu; print "hat\Sigma="; print hsigma; run Ascov(varphi,hmu1,hsigma1,x,misinfo, Abeta, Bbeta, hupsilon); run indexv(p,V_forana,index_beta); *index for both means and covariances; run indexvc(p,V_forana,index_sig); *index for only the covariances; run switch(p_v, permu_sig, permu_beta); *generating permutation matrices; print index_beta; hgamma_sig=hupsilon[index_sig,index_sig]; hgamma_sig=permu_sig*hgamma_sig*permu_sig`;*needed for the 2nd stage ML in EQS; print "\hat\Gamma_sig="; print hgamma_sig; hgamma_beta=hupsilon[index_beta,index_beta]; hgamma_beta=( permu_beta*hgamma_beta*permu_beta`||j(pvs,1,0) )//(j(1,pvs,0)||1);*needed for the 2nd stage ML in EQS; print "hat\Gamma_beta="; print hgamma_beta;