## Data processing
## Process the PIAT math data

## Read the raw PIAT math file; a lone "." in the file marks a missing value.
## `na.strings` is spelled out in full: the previous `na.string` only worked
## through partial argument matching.
piat.orig <- read.table("piatmath-age-grade.dat.txt", na.strings = ".")

## Column layout: id, sex, race, then six yearly columns (1997-2002) each for
## grade, age and piat score.
names(piat.orig) <- c(
  "id", "sex", "race",
  paste0("grade", 1997:2002),
  paste0("age", 1997:2002),
  paste0("piat", 1997:2002)
)

## Summarise missingness so that subjects can be selected on it.
## M flags every missing cell of piat.orig.
M <- is.na(piat.orig)

## M.sum keeps the per-cell flags for columns 1-3 (id, sex, race) and appends
## the per-subject number of missing values in the grade (4:9), age (10:15)
## and piat (16:21) column groups as columns 4, 5 and 6.
M.sum <- do.call(
  cbind,
  c(
    list(M[, 1:3]),
    lapply(
      list(4:9, 10:15, 16:21),
      function(cols) apply(M[, cols], 1, sum)
    )
  )
)

## how many subjects miss 0, 1, ... piat scores
table(M.sum[, 6])

## choose data with complete data on sex and age
## M.sum[, 2] flags a missing sex value and M.sum[, 5] counts the missing age
## values. The previous filter tested M.sum[, 1] (missing id), which does not
## look at sex or age at all.
piat.comp.age <- piat.orig[M.sum[, 2] == 0 & M.sum[, 5] == 0, ]
dim(piat.comp.age)

## Subjects with at least two piat scores (fewer than five of the six missing)
## and nothing missing in id, sex, race, grade or age.
## Every entry of M.sum is a missing flag or a missing count, hence
## non-negative: a zero row sum over columns 1-5 means each of them is zero.
piat.2 <- piat.orig[rowSums(M.sum[, 1:5]) == 0 & M.sum[, 6] < 5, ]
dim(piat.2)

## Boxplots of the 2000 piat score split by each background variable of
## piat.2 (sex, race, age in 1997, grade in 1997), one figure per variable.
invisible(lapply(
  c("sex", "race", "age1997", "grade1997"),
  function(v) boxplot(reformulate(v, response = "piat2000"), data = piat.2)
))

## Frequency table of each of the same four variables.
table(piat.2[["sex"]])
table(piat.2[["race"]])
table(piat.2[["age1997"]])
table(piat.2[["grade1997"]])

## Subjects with at least three piat scores (at most three of the six missing)
## and nothing missing in id, sex, race, grade or age.
## M.sum entries are non-negative, so a zero row sum over columns 1-5 means
## each of those columns is zero.
piat.3 <- piat.orig[rowSums(M.sum[, 1:5]) == 0 & M.sum[, 6] <= 3, ]
dim(piat.3)

## Subsets of piat.2 (subjects with at least two piat scores).

## grade 6 in 1997
piat.t2.g6 <- piat.2[with(piat.2, grade1997 == 6), ]
dim(piat.t2.g6)

## grade 6 and age 12 in 1997
piat.t2.g6.a12 <- piat.2[with(piat.2, grade1997 == 6 & age1997 == 12), ]
dim(piat.t2.g6.a12)

## Subset of piat.2: grade 6 and age 12 in 1997, race code 4
## (the code the original author labels "non-black").
piat.t2.g6.a12.nb <- piat.2[
  with(piat.2, grade1997 == 6 & age1997 == 12 & race == 4),
]
dim(piat.t2.g6.a12.nb)

## One histogram per yearly piat score (columns 16-21) on a 2 x 3 grid.
par(mfrow = c(2, 3))
apply(as.matrix(piat.t2.g6.a12.nb[, 16:21]), MARGIN = 2, FUN = hist)

## at least two data points at grade 6, non-black
## Only grade (6 in 1997) and race (code 4) are restricted here. The previous
## filter also required age1997 == 12, a leftover from the grade-6/age-12
## subset, which contradicted the description above.
piat.t2.g6.nb <- piat.2[piat.2$grade1997 == 6 & piat.2$race == 4, ]
dim(piat.t2.g6.nb)

## One histogram per yearly piat score (columns 16-21) on a 2 x 3 grid.
par(mfrow = c(2, 3))
apply(piat.t2.g6.nb[, 16:21], 2, hist, breaks = 20)


## at least two data points, age12, non-black
## Only age (12 in 1997) and race (code 4) are restricted here. The previous
## filter also required grade1997 == 6, a leftover from the grade-6/age-12
## subset, which contradicted the description above.
piat.t2.a12.nb <- piat.2[piat.2$age1997 == 12 & piat.2$race == 4, ]
dim(piat.t2.a12.nb)

## One histogram per yearly piat score (columns 16-21) on a 2 x 3 grid.
par(mfrow = c(2, 3))
apply(piat.t2.a12.nb[, 16:21], 2, hist, breaks = 20)

## Individual trajectories: for every row of `piat`, columns 9:14 are drawn
## against columns 3:8 as connected points.
## NOTE(review): `piat` is not created in this part of the script. Columns 3:8
## look like the yearly ages and 9:14 like the yearly piat scores (the later
## "age at 18" filter uses column 8) -- confirm before trusting the axis labels.
n <- nrow(piat)

## Open one empty frame, then add every subject the same way. Previously a
## first figure was drawn and immediately superseded, subject 1 got no line in
## the final figure, and `2:n` ran backwards when `piat` had fewer than two rows.
plot(NA, xlim = c(12, 19), ylim = c(0, 100), type = "n",
     xlab = "age", ylab = "piat")

for (i in seq_len(n)) {
  ## type = "o" draws the points and the connecting line in one call
  lines(unlist(piat[i, 3:8]), unlist(piat[i, 9:14]), type = "o")
}


## age at 18
## this is the data for analysis
## Keep the rows of `piat` whose column 8 equals 18. which() discards rows
## where column 8 is NA; a bare logical index would turn each of them into an
## all-NA row and inflate the row count below.
piat.18 <- piat[which(piat[, 8] == 18), ]
dim(piat.18)

## number of subjects kept
n <- nrow(piat.18)

## Trajectories of piat.18: columns 9:14 drawn against 13:18 for at most the
## first 100 rows (the cap of 100 is kept from the original loop).
## The row range is bounded by the actual number of rows: the former
## hard-coded `2:100` indexed past the end of a shorter table, and the first
## subject was drawn without its connecting line after a discarded first figure.
plot(NA, xlim = c(13, 18), ylim = c(0, 100), type = "n",
     xlab = "age", ylab = "piat")

for (i in seq_len(min(nrow(piat.18), 100))) {
  ## type = "o" draws the points and the connecting line in one call
  lines(13:18, unlist(piat.18[i, 9:14]), type = "o")
}

## write data for Laura for mixture modeling
## Plain text export without row or column names. TRUE/FALSE are spelled out
## because the shorthands T and F are ordinary variables that can be reassigned.
## NOTE(review): this writes columns 9:13 of the full `piat` table, i.e. five
## of the six columns 9:14 used in the plots, and not the piat.18 subset --
## confirm that this is the intended extract.
write.table(piat[9:13], "piatmath-trymixture.txt",
            row.names = FALSE, col.names = FALSE)


## some basic analysis
## Per-column boxplots and histograms of columns 9:14 of piat.18.
## boxplot() and hist() ignore missing values on their own and have no na.rm
## argument; the former `na.rm=T` was forwarded to the plotting layer as an
## unknown graphical parameter, so it has been dropped.
apply(piat.18[, 9:14], 2, boxplot)
apply(piat.18[, 9:14], 2, hist)

## side-by-side boxplots by year, on the raw and on the log10 scale
boxplot(piat.18[, 9:14], names = c("1997", "1998", "1999", "2000", "2001", "2002"))
boxplot(log10(piat.18[, 9:14]), names = c("1997", "1998", "1999", "2000", "2001", "2002"))

## histograms on the square-root scale
apply(sqrt(piat.18[, 9:14]), 2, hist)