simulations from T. cristinae GBS data

Post date: Sep 30, 2015 8:6:5 PM

We want to know how well gemma works (PVE and sparse effects) under different genetic architectures that could reflect traits and fitness in field experiments. This is relevant for a few grants and for a paper Patrik and I are working on about detecting and measuring selection in the wild from genomic data. The files referenced below are in: /labs/evolution/projects/timema_speciation/timema_genarch_sims/ (moving soon). fha2013_gemma_infiles.tar.bz2 contains real data from a GBS mapping population (FHA) and the individual genotype file with all individuals was used for the simulations.

We considered the following conditions: N = 2000 individuals, h2 = 0.3, quantitative or binary traits, 5, 50 or 500 functional variants with effects drawn from a standard normal distribution. Genotyped individuals were re-sampled with replacement to generate the population of 2000 individuals and functional variants were selected randomly from the SNPs. Here is the initial R script used to generate the simulated data (sims.R):

nl<-246258

n<-592

G<-matrix(scan("pntestTimemaGbs.txt",n=n*nl,sep=" "),nrow=nl,ncol=n,byrow=TRUE)

Ghat<-round(G,0)

sam<-sample(1:592,2000,replace=TRUE)

Gsam<-Ghat[,sam]

nr<-100

## data set 1, h2 = 0.3, nsnps = 50, normal distribution

funct1m<-matrix(NA,nrow=50,ncol=nr)

effect1m<-matrix(NA,nrow=50,ncol=nr)

bv1m<-matrix(NA,nrow=2000,ncol=nr)

ph1m<-matrix(NA,nrow=2000,ncol=nr)

her1<-rep(NA,nr)

for(x in 1:nr){

funct1<-sample(1:nl,50,replace=FALSE)

effect1<-rnorm(50,0,1)

bv1<-rep(NA,2000)

for(i in 1:2000){

bv1[i]<-sum(Gsam[funct1,i] * effect1)

}

v1<--1 * ((0.3 - 1)*var(bv1))/0.3

ph1<-bv1 + rnorm(2000,0,sqrt(v1))

her1[x]<-var(bv1)/var(ph1)

funct1m[,x]<-funct1

effect1m[,x]<-effect1

bv1m[,x]<-bv1

ph1m[,x]<-(ph1-mean(ph1))/sd(ph1)

}

## data set 2, h2 = 0.3, nsnps = 5, normal distribution

funct2m<-matrix(NA,nrow=5,ncol=nr)

effect2m<-matrix(NA,nrow=5,ncol=nr)

bv2m<-matrix(NA,nrow=2000,ncol=nr)

ph2m<-matrix(NA,nrow=2000,ncol=nr)

her2<-rep(NA,nr)

for(x in 1:nr){

funct2<-sample(1:nl,5,replace=FALSE)

effect2<-rnorm(5,0,1)

bv2<-rep(NA,2000)

for(i in 1:2000){

bv2[i]<-sum(Gsam[funct2,i] * effect2)

}

v2<--1 * ((0.3 - 1)*var(bv2))/0.3

ph2<-bv2 + rnorm(2000,0,sqrt(v2))

her2<-var(bv2)/var(ph2) ## 0.309

funct2m[,x]<-funct2

effect2m[,x]<-effect2

bv2m[,x]<-bv2

ph2m[,x]<-(ph2-mean(ph2))/sd(ph2)

}

## data set 3, h2 = 0.3, nsnps = 500, normal distribution

funct3m<-matrix(NA,nrow=500,ncol=nr)

effect3m<-matrix(NA,nrow=500,ncol=nr)

bv3m<-matrix(NA,nrow=2000,ncol=nr)

ph3m<-matrix(NA,nrow=2000,ncol=nr)

her3<-rep(NA,nr)

for(x in 1:nr){

funct3<-sample(1:nl,500,replace=FALSE)

effect3<-rnorm(500,0,1)

bv3<-rep(NA,2000)

for(i in 1:2000){

bv3[i]<-sum(Gsam[funct3,i] * effect3)

}

v3<--1 * ((0.3 - 1)*var(bv3))/0.3

ph3<-bv3 + rnorm(2000,0,sqrt(v3))

her3<-var(bv3)/var(ph3)

funct3m[,x]<-funct3

effect3m[,x]<-effect3

bv3m[,x]<-bv3

ph3m[,x]<-(ph3-mean(ph3))/sd(ph3)

}

Gout<-cbind(1:nl,rep("A",nl),rep("T",nl),Gsam)

write.table(Gout,"geno_sims.txt",row.names=F,col.names=F,quote=F)

write.table(ph1m,"pheno_sims1.txt",row.names=F,col.names=F,quote=F)

write.table(ph2m,"pheno_sims2.txt",row.names=F,col.names=F,quote=F)

write.table(ph3m,"pheno_sims3.txt",row.names=F,col.names=F,quote=F)

write.table(bv1m,"bv_sims1.txt",row.names=F,col.names=F,quote=F)

write.table(bv2m,"bv_sims2.txt",row.names=F,col.names=F,quote=F)

write.table(bv3m,"bv_sims3.txt",row.names=F,col.names=F,quote=F)

write.table(effect1m,"effect_sims1.txt",row.names=F,col.names=F,quote=F)

write.table(effect2m,"effect_sims2.txt",row.names=F,col.names=F,quote=F)

write.table(effect3m,"effect_sims3.txt",row.names=F,col.names=F,quote=F)

write.table(funct1m,"functvar_sims1.txt",row.names=F,col.names=F,quote=F)

write.table(funct2m,"functvar_sims2.txt",row.names=F,col.names=F,quote=F)

write.table(funct3m,"functvar_sims3.txt",row.names=F,col.names=F,quote=F)

save(list=ls(),file="sims.Rdata")

Additional data sets were generated for binary variables using libaility2binary.R:

## convert quantitative phenotypes (latent liability variable) to binary pheontypes

## assumes population and sample proportion of successes = 0.5 (i.e. half live half die)

N<-2000

sims<-100

K<-0.5 ## population prevalence

bin<-rep(c(0,1),each=1000)

ph1<-matrix(scan("pheno_sims1.txt",n=N*sims,sep=" "),nrow=N,ncol=sims,byrow=TRUE)

bin1<-ph1

for(i in 1:sims){

bin1[,i]<-bin[rank(ph1[,i])]

}

ph2<-matrix(scan("pheno_sims2.txt",n=N*sims,sep=" "),nrow=N,ncol=sims,byrow=TRUE)

bin2<-ph2

for(i in 1:sims){

bin2[,i]<-bin[rank(ph2[,i])]

}

ph3<-matrix(scan("pheno_sims3.txt",n=N*sims,sep=" "),nrow=N,ncol=sims,byrow=TRUE)

bin3<-ph3

for(i in 1:sims){

bin3[,i]<-bin[rank(ph3[,i])]

}

write.table(bin1,"phenobin_sims1.txt",row.names=FALSE,col.names=FALSE,quote=FALSE)

write.table(bin2,"phenobin_sims2.txt",row.names=FALSE,col.names=FALSE,quote=FALSE)

write.table(bin3,"phenobin_sims3.txt",row.names=FALSE,col.names=FALSE,quote=FALSE)

This assumes 50% success (survival). Read more about liability to binary here. Gemma will generate an estimate of h2 on the observed (0,1) scale, which is not necessarily ideal. To convert to heritability on the liability scale use h2l = h20 * K (1-K)/z^2 (eqn 17) where in this case K = 0.5 and z = dnorm(0,0,1) = 0.3989423 (this is for a 50:50 binary trait).

All six data sets (three genetic architectures and binary or quantitative traits) were analyzed with the BSLMM in gemma with these conditions (2 chains each; results are in /pscratch/A01963476/sims/output/)

gemma -g /labs/evolution/projects/timema_speciation/timema_genarch_sims/geno_sims.geno -p /labs/evolution/projects/timema_speciation/timema_genarch_sims/phenobin_sims1.txt -bslmm 1 -n 100 -o out_sim1phbin100rep1 -rpace 100 -maf 0.01 -w 1000000 -s 2000000