Folder for the analyses: /uufs/chpc.utah.edu/common/home/u6007910/projects/lyc_dubois/pairwiseLD/polarizingLD
Step 1: Flip the genotypes using the script polarizingLD_files.R saved in the same folder mentioned above. Below is the code in this script. I ran this script by submitting as a job using the bash script runPolarizingLD.R.
This step creates the files with flipped genotypes for each population. The files are named flip_genEstMatrix_*. I moved these files to the folder "allsnps".
######## getting allele frequency differences #############
### read in the AF files
idas_pure<-read.table("../../popanc/infiles/comb_idas_pure.txt", skip=1, header=F)
idas_pure_m<-as.matrix(idas_pure[,-1])
idas_pure_p<-apply(idas_pure_m,1,mean)/2
mel_sub<-read.table("../../popanc/infiles/comb_melissa_sub.txt", skip=1, header=F)
mel_sub_m<-as.matrix(mel_sub[,-1])
mel_sub_p<-apply(mel_sub_m,1,mean)/2
num<-c(1:39193)
### get the genotype files for reference ######
### aims 3 #####
gen_a3<-read.table("../../clines/difSnps30pct.txt", header=F)
gen_a3<-cbind(num,gen_a3)
gen_a3<-as.matrix(gen_a3)
### getting aims loci
afdiff<-idas_pure_p-mel_sub_p
#for loop to flip genotypes
#files= list.files(path="../../entropy/complete/mcmc", pattern = "genEstMatrix_", full.names = TRUE)
files= list.files(path="./aims3", pattern = "genEstMatrix_", full.names = TRUE)
flipgeno<-function(input){
pop<-read.table(input, header=T)
gen_af<-as.data.frame(cbind(afdiff,pop))
for(i in 1:nrow(gen_af)){
if(gen_af[,1][i] < 0){
#print(gen_af$gen[i])
gen_af[i,][2:ncol(gen_af)] <- 2-gen_af[i,][2:ncol(gen_af)]
index = which(files == input)
outname = paste('flip', basename(input), sep = "_")
write.table(gen_af[,-1], file=outname, sep=" ", row.names=FALSE, quote=FALSE)
}
}
}
for (i in 1:length(files)){
flipgeno(files[i])
}
Step 2: Calculate within and across scaffold LD for AIMS
I did this analyses in the folder: /uufs/chpc.utah.edu/common/home/u6007910/projects/lyc_dubois/pairwiseLD/polarizingLD/aims3/
Here is the code for creating flipped files for AIMS3 SNPs:
#In R
#read in the mappos file
mp<-read.table("../../../popanc/mappos.txt",sep=":",header=F)
#loop for creating the subsetted files
files= list.files(path="../allsnps", pattern = "flip_genEstMatrix_", full.names = TRUE)
combine<-function(input){
pop<-read.table(input, header=T)
pop_mp<-cbind(mp,pop)
index = which(files == input)
outname = paste('mp', basename(input), sep = "_")
write.table(pop_mp, file=outname, sep=" ", col.names=TRUE, row.names=FALSE, quote=FALSE)
}
for (i in 1:length(files)){
combine(files[i])
}
This will create files named: mp_flip_genEstMatrix_*
#In bash
## subset aims SNPs using python script
for f in mp_flip_genEstMatrix_*; do
python subset_aims_genoEst.py $f aims3_${f}; done
This will create files named: aims3_mp_flip_genEstMatrix_*
2. Run LD using calcLD_scafs.R on aims3_mp_flip_genEstMatrix* files. The script is saved in the "aims3" folder. I copy pasted the code in R and just ran it there directly. Here is the code:
#read in the mappos file
scafpos<-read.table("./mappos3.txt",header=F)
#aims
files= list.files(path=".", pattern = "aims3_mp_flip_genEstMatrix_", full.names = TRUE)
## function to compute scaffold LD
calcLDdu = function(input){
pop<-read.table(input, header=F)
pop<-cbind(scafpos,pop) #N individuals (rows) X S snps (cols)
#print(pop)
#print(pop[,-c(1,2)])
#drop SNPs with MAF < 0.05
#relevant_row_num = which((rowMeans(pop[,-c(1,2)], na.rm = T))/2 > 0.05)
#print(relevant_row_num)
#pop_sub<-pop[relevant_row_num, ]
geno<-t(pop) # snps are cols, individuals are rows, #scaf pos retained as col
#print(geno)
# Example: some positions are dropped
# 1 2 3 4 5 6 8 9 10 11
# scaffold 4 4 4 4 4 4 4 6 6 6
# position 10035 10038 10041 10054 10059 10066 21035 21041 21044 21048
# V1 0 NA 0 1 0 2 2 0 2 0
# V2 0 1 0 0 0 1 0 1 0 1
nsnps<-1:ncol(geno)
out_col = ncol(geno)
out_row = out_col
Scafmat<-matrix(NA, nrow=out_row, ncol=out_col)
Du<-matrix(NA, nrow=out_row, ncol=out_col)
for (i in 1:(out_col-1)){
for (j in i:(out_col-1)){
if (i != j){
# Subset the col for computation
snp12<-geno[,c(i,j)]
scaf<-snp12[1, ]
if (scaf[1] == scaf[2]){
#print(scaf[1])
#print(scaf[2])
Scafmat[i,j]<- 1
}
else {
Scafmat[i,j]<- 0
}
#print(Scafmat)
#print(paste(" Scaff:", scaff_id, "Proc -> Mat(", i, j, ") S->(", scaf[1], scaf[2],"), P->(", pos[1], pos[2],")"))
tmp.geno <- snp12[!(is.na(snp12[,1])) & !(is.na(snp12[,2])),]
N <- dim(tmp.geno)[1]
pA <- (2*sum(tmp.geno[,1]==0) + sum(tmp.geno[,1]==1)) / (2*N)
pB <- (2*sum(tmp.geno[,2]==0) + sum(tmp.geno[,2]==1)) / (2*N)
pa <- 1-pA
pb <- 1-pB
# pAB <- (sum(tmp.geno[,1]==0 + tmp.geno[,2]==0)*2) + sum(tmp.geno[1,]==1 + tmp.geno[2,]==0) + sum(tmp.geno[1,]==0 + tmp.geno[2,]==1) + sum(tmp.geno[1,]==1 + tmp.geno[2,]==1) / (2*N)
nAABB = sum(tmp.geno[,1]==0 & tmp.geno[,2]==0) # 00
nAABb = sum(tmp.geno[,1]==0 & tmp.geno[,2]==1) # 01
nAaBB = sum(tmp.geno[,1]==1 & tmp.geno[,2]==0) # 10
nAaBb = sum(tmp.geno[,1]==1 & tmp.geno[,2]==1) # 11
###### calculate unbiased estimator D ##################
Du[i,j]<-(N / (N-1)) * (((4*nAABB + 2*(nAABb + nAaBB) + nAaBb)/ (2*N))-(2*pA*pB))
}
}
}
#get vectors to write out
ld<-Du[upper.tri(Du)]
#print(ld)
scafbin<-Scafmat[upper.tri(Scafmat)]
#print(scafbin)
ld_scaf<-cbind(scafbin,ld)
#print(ld_scaf)
index = which(files == input)
outname = paste('ldscaf', basename(input), sep = "_")
write.table(ld_scaf, file=outname, sep=" ", row.names=FALSE, quote=FALSE)
}
## apply the function for all the population files
for (i in 1:length(files)){
calcLDdu(files[i])
}
3. Plot LD from the code in plotLDscafs.R
Here is the code:
##plot parents vs DBS
ld_dbs<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_DBS.txt", header=T)
ld_cdy<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_CDY.txt", header=T)
ld_ckv<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_CKV.txt", header=T)
ld_sin<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_SIN.txt", header=T)
ld_bst<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_BST.txt", header=T)
ld_khl<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_KHL.txt", header=T)
ld_sdc<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_SDC.txt", header=T)
ld_syc<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_SYC.txt", header=T)
#trial plot
par(mfrow=c(2,2))
boxplot(ld~scafbin, data=ld_bld, ylab="Unbiased D", names=c("Within","Between"), main="BLD (N=74)")
boxplot(ld~scafbin, data=ld_frc, ylab="Unbiased D", names=c("Within","Between"), main="FRC (N=20)")
boxplot(ld~scafbin, data=ld_ckv, ylab="Unbiased D", names=c("Within","Between"), main="CKV (N=10)")
boxplot(ld~scafbin, data=ld_dbs, ylab="Unbiased D", names=c("Within","Between"), main="DBS (N=115)")
par(mfrow=c(2,2))
boxplot(abs(ld)~scafbin, data=ld_bld, ylab="Unbiased D", names=c("Within","Between"), main="BLD (N=74)")
boxplot(abs(ld)~scafbin, data=ld_frc, ylab="Unbiased D", names=c("Within","Between"), main="FRC (N=20)")
boxplot(abs(ld)~scafbin, data=ld_ckv, ylab="Unbiased D", names=c("Within","Between"), main="CKV (N=10)")
boxplot(abs(ld)~scafbin, data=ld_dbs, ylab="Unbiased D", names=c("Within","Between"), main="DBS (N=115)")
#dbs versus other high pop size pops
ld_bcr<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_BCR.txt", header=T)
ld_bld<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_BLD.txt", header=T)
ld_btb<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_BTB.txt", header=T)
ld_gnp<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_GNP.txt", header=T)
ld_sin<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_SIN.txt", header=T)
ld_dbs<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_DBS.txt", header=T)
pdf("ldscafs_dbs_parents.pdf", width = 10, height = 8)
par(mfrow=c(4,2))
par(mar=c(2,7,1,6.5))
boxplot(ld~scafbin, data=ld_dbs, ylab="Unbiased D", names=c("Between","Within"), main="DBS (N=115)", yaxt="n")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_cdy, ylab="Unbiased D", names=c("Between", "Within"), main="CDY (N=23)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_ckv, ylab="Unbiased D", names=c("Between", "Within"), main="CKV (N=10)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_sin, ylab="Unbiased D", names=c("Between","Within"), main="SIN (N=97)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_bst, ylab="Unbiased D", names=c("Between","Within"), main="BST (N=24)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_khl, ylab="Unbiased D", names=c("Between","Within"), main="KHL (N=18)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_sdc, ylab="Unbiased D", names=c("Between","Within"), main="SDC (N=20)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_syc, ylab="Unbiased D", names=c("Between","Within"), main="SYC (N=20)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
dev.off()
## dbs and high pops ##
pdf("ldscafs_dbs_highpopsize.pdf", width = 10, height = 8)
par(mfrow=c(2,3))
par(mar=c(2,7,1,6.5))
#dbs
boxplot(ld~scafbin, data=ld_dbs, ylab="Unbiased D", names=c("Between","Within"), main="DBS (N=115)", yaxt="n")
ytick<-seq(-0.2,0.4,by=0.2)
axis(side=2, at=ytick, labels = T)
#bcr
boxplot(ld~scafbin, data=ld_bcr, ylab="Unbiased D", names=c("Between","Within"), main="BCR (N=46)", yaxt="n")
ytick<-seq(-0.2,0.4,by=0.2)
axis(side=2, at=ytick, labels = T)
#bld
boxplot(ld~scafbin, data=ld_bld, ylab="Unbiased D", names=c("Between","Within"), main="BLD (N=74)", yaxt="n")
ytick<-seq(-0.2,0.4,by=0.2)
axis(side=2, at=ytick, labels = T)
#btb
boxplot(ld~scafbin, data=ld_btb, ylab="Unbiased D", names=c("Between","Within"), main="BTB (N=46)", yaxt="n")
ytick<-seq(-0.2,0.4,by=0.2)
axis(side=2, at=ytick, labels = T)
#gnp
boxplot(ld~scafbin, data=ld_sin, ylab="Unbiased D", names=c("Between","Within"), main="GNP (N=98)", yaxt="n")
ytick<-seq(-0.2,0.4,by=0.2)
axis(side=2, at=ytick, labels = T)
#sin
boxplot(ld~scafbin, data=ld_sin, ylab="Unbiased D", names=c("Between","Within"), main="SIN (N=97)", yaxt="n")
ytick<-seq(-0.2,0.4,by=0.2)
axis(side=2, at=ytick, labels = T)
dev.off()
#### plotting within and across scaffolds separately
##within
#subset rows with 1
dbs_within<-ld_dbs[which(ld_dbs$scafbin == 1),]
bcr_within<-ld_bcr[which(ld_bcr$scafbin == 1),]
bld_within<-ld_bld[which(ld_bld$scafbin == 1),]
btb_within<-ld_btb[which(ld_btb$scafbin == 1),]
gnp_within<-ld_gnp[which(ld_gnp$scafbin == 1),]
sin_within<-ld_sin[which(ld_sin$scafbin == 1),]
#create vectors
bld_rep<-rep("bld", dim(bld_within)[1])
btb_rep<-rep("btb", dim(btb_within)[1])
sin_rep<-rep("sin", dim(sin_within)[1])
bcr_rep<-rep("bcr", dim(bcr_within)[1])
gnp_rep<-rep("gnp", dim(gnp_within)[1])
dbs_rep<-rep("dbs", dim(dbs_within)[1])
#combine
dbs_within<-cbind(dbs_rep,dbs_within)
bcr_within<-cbind(bcr_rep,bcr_within)
bld_within<-cbind(bld_rep,bld_within)
btb_within<-cbind(btb_rep,btb_within)
sin_within<-cbind(sin_rep,sin_within)
gnp_within<-cbind(gnp_rep,gnp_within)
colnames(dbs_within)<-c("pop","scaf","ld")
colnames(bcr_within)<-c("pop","scaf","ld")
colnames(bld_within)<-c("pop","scaf","ld")
colnames(btb_within)<-c("pop","scaf","ld")
colnames(sin_within)<-c("pop","scaf","ld")
colnames(gnp_within)<-c("pop","scaf","ld")
#combine all the pops
all_within<-rbind(dbs_within,bcr_within,bld_within,btb_within,sin_within,gnp_within)
boxplot(ld~pop, data=all_within)
##across
#subset
dbs_across<-ld_dbs[which(ld_dbs$scafbin == 0),]
bcr_across<-ld_bcr[which(ld_bcr$scafbin == 0),]
bld_across<-ld_bld[which(ld_bld$scafbin == 0),]
btb_across<-ld_btb[which(ld_btb$scafbin == 0),]
gnp_across<-ld_gnp[which(ld_gnp$scafbin == 0),]
sin_across<-ld_sin[which(ld_sin$scafbin == 0),]
#vectors
bld_rep<-rep("bld", dim(bld_across)[1])
btb_rep<-rep("btb", dim(btb_across)[1])
sin_rep<-rep("sin", dim(sin_across)[1])
bcr_rep<-rep("bcr", dim(bcr_across)[1])
gnp_rep<-rep("gnp", dim(gnp_across)[1])
dbs_rep<-rep("dbs", dim(dbs_across)[1])
#combine
dbs_across<-cbind(dbs_rep,dbs_across)
bcr_across<-cbind(bcr_rep,bcr_across)
bld_across<-cbind(bld_rep,bld_across)
btb_across<-cbind(btb_rep,btb_across)
sin_across<-cbind(sin_rep,sin_across)
gnp_across<-cbind(gnp_rep,gnp_across)
colnames(dbs_across)<-c("pop","scaf","ld")
colnames(bcr_across)<-c("pop","scaf","ld")
colnames(bld_across)<-c("pop","scaf","ld")
colnames(btb_across)<-c("pop","scaf","ld")
colnames(sin_across)<-c("pop","scaf","ld")
colnames(gnp_across)<-c("pop","scaf","ld")
#combine pops
all_across<-rbind(dbs_across,bcr_across,bld_across,btb_across,sin_across,gnp_across)
boxplot(ld~pop, data=all_across)
#plot
pdf("ldscafs_highpopsizes.pdf", width=13, height=12)
par(mfrow=c(2,1))
boxplot(ld~pop, data=all_within, ylab="Unbiased D", names=c("dbs-115","bcr-46","bld-74","btb-46","sin-97","gnp-98"), main="Within")
boxplot(ld~pop, data=all_across, ylab="Unbiased D", names=c("dbs-115","bcr-46","bld-74","btb-46","sin-97","gnp-98"), main="Across")
dev.off()
###plotting quantiles to compare DBS and SIN
#subset within and across
dbs_within<-ld_dbs[which(ld_dbs$scafbin == 1),]
dbs_across<-ld_dbs[which(ld_dbs$scafbin == 0),]
sin_within<-ld_sin[which(ld_sin$scafbin == 1),]
sin_across<-ld_sin[which(ld_sin$scafbin == 0),]
bld_within<-ld_bld[which(ld_bld$scafbin == 1),]
bld_across<-ld_bld[which(ld_bld$scafbin == 0),]
bcr_within<-ld_bcr[which(ld_bcr$scafbin == 1),]
bcr_across<-ld_bcr[which(ld_bcr$scafbin == 0),]
btb_within<-ld_btb[which(ld_btb$scafbin == 1),]
btb_across<-ld_btb[which(ld_btb$scafbin == 0),]
gnp_within<-ld_gnp[which(ld_gnp$scafbin == 1),]
gnp_across<-ld_gnp[which(ld_gnp$scafbin == 0),]
#take quantiles
#within
dbs_within_quant<-quantile(dbs_within$ld, probs=seq(0,1,0.01), na.rm=T)
sin_within_quant<-quantile(sin_within$ld, probs=seq(0,1,0.01), na.rm=T)
bcr_within_quant<-quantile(bcr_within$ld, probs=seq(0,1,0.01), na.rm=T)
bld_within_quant<-quantile(bld_within$ld, probs=seq(0,1,0.01), na.rm=T)
btb_within_quant<-quantile(btb_within$ld, probs=seq(0,1,0.01), na.rm=T)
gnp_within_quant<-quantile(gnp_within$ld, probs=seq(0,1,0.01), na.rm=T)
#across
dbs_across_quant<-quantile(dbs_across$ld, probs=seq(0,1,0.01), na.rm=T)
sin_across_quant<-quantile(sin_across$ld, probs=seq(0,1,0.01), na.rm=T)
bcr_across_quant<-quantile(bcr_across$ld, probs=seq(0,1,0.01), na.rm=T)
bld_across_quant<-quantile(bld_across$ld, probs=seq(0,1,0.01), na.rm=T)
btb_across_quant<-quantile(btb_across$ld, probs=seq(0,1,0.01), na.rm=T)
gnp_across_quant<-quantile(gnp_across$ld, probs=seq(0,1,0.01), na.rm=T)
## plot
pdf("qqplot_highpopsize_within.pdf", width=10, height=8)
#within
par(mfrow=c(2,3))
qqplot(dbs_within_quant, bcr_within_quant, main="DBS-BCR")
abline(0,1)
qqplot(dbs_within_quant, bld_within_quant, main="DBS-BLD")
abline(0,1)
qqplot(dbs_within_quant, btb_within_quant, main="DBS-BTB")
abline(0,1)
qqplot(dbs_within_quant, sin_within_quant, main="DBS-SIN")
abline(0,1)
qqplot(dbs_within_quant, gnp_within_quant, main="DBS-GNP")
abline(0,1)
dev.off()
pdf("qqplot_highpopsize_across.pdf", width=10, height=8)
#across
par(mfrow=c(2,3))
qqplot(dbs_across_quant, bcr_across_quant, main="DBS-BCR")
abline(0,1)
qqplot(dbs_across_quant, bld_across_quant, main="DBS-BLD")
abline(0,1)
qqplot(dbs_across_quant, btb_across_quant, main="DBS-BTB")
abline(0,1)
qqplot(dbs_across_quant, sin_across_quant, main="DBS-SIN")
abline(0,1)
qqplot(dbs_across_quant, gnp_across_quant, main="DBS-GNP")
abline(0,1)
dev.off()
##trial plot
par(mfrow=c(2,2))
#within
qqnorm(dbs_within_quant,pch=1, main="DBS-within", ylim=c(-0.3,0.8))
qqline(dbs_within_quant,col="steelblue",lwd=2)
qqnorm(sin_within_quant,pch=1, main="SIN-within", ylim=c(-0.3,0.8))
qqline(sin_within_quant,col="steelblue",lwd=2)
#across
qqnorm(dbs_across_quant,pch=1, main="DBS-across", ylim=c(-0.3,0.8))
qqline(dbs_across_quant,col="steelblue",lwd=2)
qqnorm(sin_across_quant,pch=1, main="SIN-across", ylim=c(-0.3,0.8))
qqline(sin_across_quant,col="steelblue",lwd=2)
dev.copy(pdf,"qqplot_dbs_sin.pdf")
dev.off()
####### THIS IS EXTRA STUFF#############################
I used the flipped genotype files (flip_genEstmatrix_*) to run the LD estimates using the calcLD_scafs.R script which calculates estimates between and within scaffolds. The script is saved in the same folder.
Here is that script:
#read in the scaffold, position file
scafpos<-read.table("../../popanc/mappos.txt",sep=":",header=F)
#aims
scafpos<-read.table("./mappos3.txt",sep=":",header=F)
files= list.files(path=".", pattern = "flip_", full.names = TRUE)
#aims
files= list.files(path=".", pattern = "aims3_", full.names = TRUE)
## function to compute scaffold LD
calcLDdu = function(input){
pop<-read.table(input, header=F)
pop<-cbind(scafpos,pop) #N individuals (rows) X S snps (cols)
#print(pop)
#print(pop[,-c(1,2)])
#drop SNPs with MAF < 0.05
relevant_row_num = which((rowMeans(pop[,-c(1,2)], na.rm = T))/2 > 0.05)
#print(relevant_row_num)
pop_sub<-pop[relevant_row_num, ]
geno<-t(pop_sub) # snps are cols, individuals are rows, #scaf pos retained as col
#print(geno)
# Example: some positions are dropped
# 1 2 3 4 5 6 8 9 10 11
# scaffold 4 4 4 4 4 4 4 6 6 6
# position 10035 10038 10041 10054 10059 10066 21035 21041 21044 21048
# V1 0 NA 0 1 0 2 2 0 2 0
# V2 0 1 0 0 0 1 0 1 0 1
nsnps<-1:ncol(geno)
out_col = ncol(geno)
out_row = out_col
Scafmat<-matrix(NA, nrow=out_row, ncol=out_col)
Du<-matrix(NA, nrow=out_row, ncol=out_col)
for (i in 1:(out_col-1)){
for (j in 1:(out_col-1)){
if (i != j){
# Subset the col for computation
snp12<-geno[,c(i,j)]
scaf<-snp12[1, ]
if (scaf[1] == scaf[2]){
#print(scaf[1])
#print(scaf[2])
Scafmat[i,j]<- 1
}
else {
Scafmat[i,j]<- 0
}
#print(Scafmat)
#print(paste(" Scaff:", scaff_id, "Proc -> Mat(", i, j, ") S->(", scaf[1], scaf[2],"), P->(", pos[1], pos[2],")"))
tmp.geno <- snp12[!(is.na(snp12[,1])) & !(is.na(snp12[,2])),]
N <- dim(tmp.geno)[1]
pA <- (2*sum(tmp.geno[,1]==0) + sum(tmp.geno[,1]==1)) / (2*N)
pB <- (2*sum(tmp.geno[,2]==0) + sum(tmp.geno[,2]==1)) / (2*N)
pa <- 1-pA
pb <- 1-pB
# pAB <- (sum(tmp.geno[,1]==0 + tmp.geno[,2]==0)*2) + sum(tmp.geno[1,]==1 + tmp.geno[2,]==0) + sum(tmp.geno[1,]==0 + tmp.geno[2,]==1) + sum(tmp.geno[1,]==1 + tmp.geno[2,]==1) / (2*N)
nAABB = sum(tmp.geno[,1]==0 & tmp.geno[,2]==0) # 00
nAABb = sum(tmp.geno[,1]==0 & tmp.geno[,2]==1) # 01
nAaBB = sum(tmp.geno[,1]==1 & tmp.geno[,2]==0) # 10
nAaBb = sum(tmp.geno[,1]==1 & tmp.geno[,2]==1) # 11
###### calculate unbiased estimator D ##################
Du[i,j]<-(N / (N-1)) * (((4*nAABB + 2*(nAABb + nAaBB) + nAaBb)/ (2*N))-(2*pA*pB))
}
}
}
#get vectors to write out
ld<-Du[upper.tri(Du)]
#print(ld)
scafbin<-Scafmat[upper.tri(Scafmat)]
#print(scafbin)
ld_scaf<-cbind(scafbin,ld)
#print(ld_scaf)
index = which(files == input)
outname = paste('ldscaf', basename(input), sep = "_")
write.table(ld_scaf, file=outname, sep=" ", row.names=FALSE, quote=FALSE)
}
## apply the function for all the population files
for (i in 1:length(files)){
calcLDdu(files[i])
}
I repeated this for all SNPs and AIMS3. For all the SNPs the files were too big and I am not able to see the output plot.
############################### AIMS3 #########################################################################
Here is the codes for creating flipped files for AIMS3 SNPs:
#read in the mappos file
mp<-read.table("../../../popanc/mappos.txt",sep=":",header=F)
#loop for creating the subsetted files
files= list.files(path="../allsnps", pattern = "flip_genEstMatrix_", full.names = TRUE)
combine<-function(input){
pop<-read.table(input, header=T)
pop_mp<-cbind(mp,pop)
index = which(files == input)
outname = paste('mp', basename(input), sep = "_")
write.table(pop_mp, file=outname, sep=" ", col.names=TRUE, row.names=FALSE, quote=FALSE)
}
for (i in 1:length(files)){
combine(files[i])
}
## subset aims SNPs using python script
for f in mp_flip_genEstMatrix_*; do
python subset_aims_genoEst.py $f aims3_${f}; done
Run LD using calcLD_scafs.R for these output files now. These files are still saved in this folder: /uufs/chpc.utah.edu/common/home/u6007910/projects/lyc_dubois/pairwiseLD/polarizingLD/aims3
########################## plots ######################################################
##plot parents vs DBS
ld_dbs<-read.table("ldscaf_aims3_dbs.txt", header=T)
ld_cdy<-read.table("ldscaf_aims3_cdy.txt", header=T)
ld_ckv<-read.table("ldscaf_aims3_ckv.txt", header=T)
ld_sin<-read.table("ldscaf_aims3_sin.txt", header=T)
ld_bst<-read.table("ldscaf_aims3_bst.txt", header=T)
ld_khl<-read.table("ldscaf_aims3_khl.txt", header=T)
ld_sdc<-read.table("ldscaf_aims3_sdc.txt", header=T)
ld_syc<-read.table("ldscaf_aims3_syc.txt", header=T)
#dbs versus othe high pop size pops
ld_bcr<-read.table("ldscaf_aims3_bcr.txt", header=T)
ld_bld<-read.table("ldscaf_aims3_bld.txt", header=T)
ld_btb<-read.table("ldscaf_aims3_btb.txt", header=T)
ld_sin<-read.table("ldscaf_aims3_sin.txt", header=T)
ld_dbs<-read.table("ldscaf_aims3_dbs.txt", header=T)
pdf("ldscafs_dbs_parents.pdf", width = 10, height = 8)
par(mfrow=c(4,2))
par(mar=c(2,7,1,6.5))
boxplot(ld~scafbin, data=ld_dbs, ylab="Unbiased D", names=c("Between","Within"), main="DBS (N=115)", yaxt="n")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_cdy, ylab="Unbiased D", names=c("Between", "Within"), main="CDY (N=23)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_ckv, ylab="Unbiased D", names=c("Between", "Within"), main="CKV (N=10)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_sin, ylab="Unbiased D", names=c("Between","Within"), main="SIN (N=97)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_bst, ylab="Unbiased D", names=c("Between","Within"), main="BST (N=24)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_khl, ylab="Unbiased D", names=c("Between","Within"), main="KHL (N=18)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_sdc, ylab="Unbiased D", names=c("Between","Within"), main="SDC (N=20)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
boxplot(ld~scafbin, data=ld_syc, ylab="Unbiased D", names=c("Between","Within"), main="SYC (N=20)")
ytick<-seq(-0.2,0.6,by=0.1)
axis(side=2, at=ytick, labels = T)
dev.off()
## doing within and across plots separately
##within
dbs_within<-ld_dbs[which(ld_dbs$scafbin == 1),]
bcr_within<-ld_bcr[which(ld_bcr$scafbin == 1),]
bld_within<-ld_bld[which(ld_bld$scafbin == 1),]
btb_within<-ld_btb[which(ld_btb$scafbin == 1),]
sin_within<-ld_sin[which(ld_sin$scafbin == 1),]
bld_rep<-rep("bld", 40602)
btb_rep<-rep("btb", 44571)
sin_rep<-rep("sin", 18001)
bcr_rep<-rep("bcr", 43563)
dbs_rep<-rep("dbs", 41140)
dbs_within<-cbind(dbs_rep,dbs_within)
bcr_within<-cbind(bcr_rep,bcr_within)
bld_within<-cbind(bld_rep,bld_within)
btb_within<-cbind(btb_rep,btb_within)
sin_within<-cbind(sin_rep,sin_within)
colnames(dbs_within)<-c("pop","scaf","ld")
colnames(bcr_within)<-c("pop","scaf","ld")
colnames(bld_within)<-c("pop","scaf","ld")
colnames(btb_within)<-c("pop","scaf","ld")
colnames(sin_within)<-c("pop","scaf","ld")
all_within<-rbind(dbs_within,bcr_within,bld_within,btb_within,sin_within)
boxplot(ld~pop, data=all_within)
##across
dbs_across<-ld_dbs[which(ld_dbs$scafbin == 0),]
bcr_across<-ld_bcr[which(ld_bcr$scafbin == 0),]
bld_across<-ld_bld[which(ld_bld$scafbin == 0),]
btb_across<-ld_btb[which(ld_btb$scafbin == 0),]
sin_across<-ld_sin[which(ld_sin$scafbin == 0),]
dim(bld_across)
dim(btb_across)
dim(sin_across)
dim(bcr_across)
dim(dbs_across)
bld_rep<-rep("bld", 518551)
btb_rep<-rep("btb", 591057)
sin_rep<-rep("sin", 287370)
bcr_rep<-rep("bcr", 577492)
dbs_rep<-rep("dbs", 551276)
dbs_across<-cbind(dbs_rep,dbs_across)
bcr_across<-cbind(bcr_rep,bcr_across)
bld_across<-cbind(bld_rep,bld_across)
btb_across<-cbind(btb_rep,btb_across)
sin_across<-cbind(sin_rep,sin_across)
colnames(dbs_across)<-c("pop","scaf","ld")
colnames(bcr_across)<-c("pop","scaf","ld")
colnames(bld_across)<-c("pop","scaf","ld")
colnames(btb_across)<-c("pop","scaf","ld")
colnames(sin_across)<-c("pop","scaf","ld")
all_across<-rbind(dbs_across,bcr_across,bld_across,btb_across,sin_across)
boxplot(ld~pop, data=all_across)
#plot
par(mfrow=c(2,1))
boxplot(ld~pop, data=all_within, ylab="Unbiased D", names=c("dbs-115","bcr-46","bld-74","btb-46","sin-97"), main="Within")
boxplot(ld~pop, data=all_across, ylab="Unbiased D", names=c("dbs-115","bcr-46","bld-74","btb-46","sin-97"), main="Across")
dev.copy(pdf, "ldscafs_highpopsizes.pdf")
dev.off()
###plotting quantiles to compare DBS and SIN
#subset within and across
dbs_within<-ld_dbs[which(ld_dbs$scafbin == 1),]
dbs_across<-ld_dbs[which(ld_dbs$scafbin == 0),]
sin_within<-ld_sin[which(ld_sin$scafbin == 1),]
sin_across<-ld_sin[which(ld_sin$scafbin == 0),]
bld_within<-ld_bld[which(ld_bld$scafbin == 1),]
bld_across<-ld_bld[which(ld_bld$scafbin == 0),]
bcr_within<-ld_bcr[which(ld_bcr$scafbin == 1),]
bcr_across<-ld_bcr[which(ld_bcr$scafbin == 0),]
btb_within<-ld_btb[which(ld_btb$scafbin == 1),]
btb_across<-ld_btb[which(ld_btb$scafbin == 0),]
#take quantiles
#within
dbs_within_quant<-quantile(dbs_within$ld, probs=seq(0,1,0.01), na.rm=T)
sin_within_quant<-quantile(sin_within$ld, probs=seq(0,1,0.01), na.rm=T)
bcr_within_quant<-quantile(bcr_within$ld, probs=seq(0,1,0.01), na.rm=T)
bld_within_quant<-quantile(bld_within$ld, probs=seq(0,1,0.01), na.rm=T)
btb_within_quant<-quantile(btb_within$ld, probs=seq(0,1,0.01), na.rm=T)
#across
dbs_across_quant<-quantile(dbs_across$ld, probs=seq(0,1,0.01), na.rm=T)
sin_across_quant<-quantile(sin_across$ld, probs=seq(0,1,0.01), na.rm=T)
bcr_across_quant<-quantile(bcr_across$ld, probs=seq(0,1,0.01), na.rm=T)
bld_across_quant<-quantile(bld_across$ld, probs=seq(0,1,0.01), na.rm=T)
btb_across_quant<-quantile(btb_across$ld, probs=seq(0,1,0.01), na.rm=T)
## plot
#within
par(mfrow=c(2,4))
qqplot(dbs_within_quant, bcr_within_quant, main="DBS-BCR")
qqplot(dbs_within_quant, bld_within_quant, main="DBS-BLD")
qqplot(dbs_within_quant, btb_within_quant, main="DBS-BTB")
qqplot(dbs_within_quant, sin_within_quant, main="DBS-SIN")
#across
qqplot(dbs_across_quant, bcr_across_quant, main="DBS-BCR")
qqplot(dbs_across_quant, bld_across_quant, main="DBS-BLD")
qqplot(dbs_across_quant, btb_across_quant, main="DBS-BTB")
qqplot(dbs_across_quant, sin_across_quant, main="DBS-SIN")
dev.copy(pdf, "qqplot_highpopsize.pdf")
dev.off()