Post date: Sep 30, 2015 8:6:5 PM
We want to know how well gemma works (PVE and sparse effects) under different genetic architectures that could reflect traits and fitness in field experiments. This is relevant for a few grants and for a paper Patrik and I are working on about detecting and measuring selection in the wild from genomic data. The files referenced below are in: /labs/evolution/projects/timema_speciation/timema_genarch_sims/ (moving soon). fha2013_gemma_infiles.tar.bz2 contains real data from a GBS mapping population (FHA) and the individual genotype file with all individuals was used for the simulations.
We considered the following conditions: N = 2000 individuals, h2 = 0.3, quantitative or binary traits, 5, 50 or 500 functional variants with effects drawn from a standard normal distribution. Genotyped individuals were re-sampled with replacement to generate the population of 2000 individuals and functional variants were selected randomly from the SNPs. Here is the initial R script used to generate the simulated data (sims.R):
nl<-246258
n<-592
G<-matrix(scan("pntestTimemaGbs.txt",n=n*nl,sep=" "),nrow=nl,ncol=n,byrow=TRUE)
Ghat<-round(G,0)
sam<-sample(1:592,2000,replace=TRUE)
Gsam<-Ghat[,sam]
nr<-100
## data set 1, h2 = 0.3, nsnps = 50, normal distribution
funct1m<-matrix(NA,nrow=50,ncol=nr)
effect1m<-matrix(NA,nrow=50,ncol=nr)
bv1m<-matrix(NA,nrow=2000,ncol=nr)
ph1m<-matrix(NA,nrow=2000,ncol=nr)
her1<-rep(NA,nr)
for(x in 1:nr){
funct1<-sample(1:nl,50,replace=FALSE)
effect1<-rnorm(50,0,1)
bv1<-rep(NA,2000)
for(i in 1:2000){
bv1[i]<-sum(Gsam[funct1,i] * effect1)
}
v1<--1 * ((0.3 - 1)*var(bv1))/0.3
ph1<-bv1 + rnorm(2000,0,sqrt(v1))
her1[x]<-var(bv1)/var(ph1)
funct1m[,x]<-funct1
effect1m[,x]<-effect1
bv1m[,x]<-bv1
ph1m[,x]<-(ph1-mean(ph1))/sd(ph1)
}
## data set 2, h2 = 0.3, nsnps = 5, normal distribution
funct2m<-matrix(NA,nrow=5,ncol=nr)
effect2m<-matrix(NA,nrow=5,ncol=nr)
bv2m<-matrix(NA,nrow=2000,ncol=nr)
ph2m<-matrix(NA,nrow=2000,ncol=nr)
her2<-rep(NA,nr)
for(x in 1:nr){
funct2<-sample(1:nl,5,replace=FALSE)
effect2<-rnorm(5,0,1)
bv2<-rep(NA,2000)
for(i in 1:2000){
bv2[i]<-sum(Gsam[funct2,i] * effect2)
}
v2<--1 * ((0.3 - 1)*var(bv2))/0.3
ph2<-bv2 + rnorm(2000,0,sqrt(v2))
her2<-var(bv2)/var(ph2) ## 0.309
funct2m[,x]<-funct2
effect2m[,x]<-effect2
bv2m[,x]<-bv2
ph2m[,x]<-(ph2-mean(ph2))/sd(ph2)
}
## data set 3, h2 = 0.3, nsnps = 500, normal distribution
funct3m<-matrix(NA,nrow=500,ncol=nr)
effect3m<-matrix(NA,nrow=500,ncol=nr)
bv3m<-matrix(NA,nrow=2000,ncol=nr)
ph3m<-matrix(NA,nrow=2000,ncol=nr)
her3<-rep(NA,nr)
for(x in 1:nr){
funct3<-sample(1:nl,500,replace=FALSE)
effect3<-rnorm(500,0,1)
bv3<-rep(NA,2000)
for(i in 1:2000){
bv3[i]<-sum(Gsam[funct3,i] * effect3)
}
v3<--1 * ((0.3 - 1)*var(bv3))/0.3
ph3<-bv3 + rnorm(2000,0,sqrt(v3))
her3<-var(bv3)/var(ph3)
funct3m[,x]<-funct3
effect3m[,x]<-effect3
bv3m[,x]<-bv3
ph3m[,x]<-(ph3-mean(ph3))/sd(ph3)
}
Gout<-cbind(1:nl,rep("A",nl),rep("T",nl),Gsam)
write.table(Gout,"geno_sims.txt",row.names=F,col.names=F,quote=F)
write.table(ph1m,"pheno_sims1.txt",row.names=F,col.names=F,quote=F)
write.table(ph2m,"pheno_sims2.txt",row.names=F,col.names=F,quote=F)
write.table(ph3m,"pheno_sims3.txt",row.names=F,col.names=F,quote=F)
write.table(bv1m,"bv_sims1.txt",row.names=F,col.names=F,quote=F)
write.table(bv2m,"bv_sims2.txt",row.names=F,col.names=F,quote=F)
write.table(bv3m,"bv_sims3.txt",row.names=F,col.names=F,quote=F)
write.table(effect1m,"effect_sims1.txt",row.names=F,col.names=F,quote=F)
write.table(effect2m,"effect_sims2.txt",row.names=F,col.names=F,quote=F)
write.table(effect3m,"effect_sims3.txt",row.names=F,col.names=F,quote=F)
write.table(funct1m,"functvar_sims1.txt",row.names=F,col.names=F,quote=F)
write.table(funct2m,"functvar_sims2.txt",row.names=F,col.names=F,quote=F)
write.table(funct3m,"functvar_sims3.txt",row.names=F,col.names=F,quote=F)
save(list=ls(),file="sims.Rdata")
Additional data sets were generated for binary variables using libaility2binary.R:
## convert quantitative phenotypes (latent liability variable) to binary pheontypes
## assumes population and sample proportion of successes = 0.5 (i.e. half live half die)
N<-2000
sims<-100
K<-0.5 ## population prevalence
bin<-rep(c(0,1),each=1000)
ph1<-matrix(scan("pheno_sims1.txt",n=N*sims,sep=" "),nrow=N,ncol=sims,byrow=TRUE)
bin1<-ph1
for(i in 1:sims){
bin1[,i]<-bin[rank(ph1[,i])]
}
ph2<-matrix(scan("pheno_sims2.txt",n=N*sims,sep=" "),nrow=N,ncol=sims,byrow=TRUE)
bin2<-ph2
for(i in 1:sims){
bin2[,i]<-bin[rank(ph2[,i])]
}
ph3<-matrix(scan("pheno_sims3.txt",n=N*sims,sep=" "),nrow=N,ncol=sims,byrow=TRUE)
bin3<-ph3
for(i in 1:sims){
bin3[,i]<-bin[rank(ph3[,i])]
}
write.table(bin1,"phenobin_sims1.txt",row.names=FALSE,col.names=FALSE,quote=FALSE)
write.table(bin2,"phenobin_sims2.txt",row.names=FALSE,col.names=FALSE,quote=FALSE)
write.table(bin3,"phenobin_sims3.txt",row.names=FALSE,col.names=FALSE,quote=FALSE)
This assumes 50% success (survival). Read more about liability to binary here. Gemma will generate an estimate of h2 on the observed (0,1) scale, which is not necessarily ideal. To convert to heritability on the liability scale use h2l = h20 * K (1-K)/z^2 (eqn 17) where in this case K = 0.5 and z = dnorm(0,0,1) = 0.3989423 (this is for a 50:50 binary trait).
All six data sets (three genetic architectures and binary or quantitative traits) were analyzed with the BSLMM in gemma with these conditions (2 chains each; results are in /pscratch/A01963476/sims/output/)
gemma -g /labs/evolution/projects/timema_speciation/timema_genarch_sims/geno_sims.geno -p /labs/evolution/projects/timema_speciation/timema_genarch_sims/phenobin_sims1.txt -bslmm 1 -n 100 -o out_sim1phbin100rep1 -rpace 100 -maf 0.01 -w 1000000 -s 2000000