polarizing_LD

Folder for the analyses: /uufs/chpc.utah.edu/common/home/u6007910/projects/lyc_dubois/pairwiseLD/polarizingLD

Step 1: Flip the genotypes using the script polarizingLD_files.R saved in the same folder mentioned above. Below is the code in this script. I ran this script by submitting as a job using the bash script runPolarizingLD.R.

This step creates the files with flipped genotypes for each population. The files are named flip_genEstMatrix_*. I moved these files to the folder "allsnps".

######## getting allele frequency differences #############

### read in the AF files

idas_pure<-read.table("../../popanc/infiles/comb_idas_pure.txt", skip=1, header=F)

idas_pure_m<-as.matrix(idas_pure[,-1])

idas_pure_p<-apply(idas_pure_m,1,mean)/2

mel_sub<-read.table("../../popanc/infiles/comb_melissa_sub.txt", skip=1, header=F)

mel_sub_m<-as.matrix(mel_sub[,-1])

mel_sub_p<-apply(mel_sub_m,1,mean)/2

num<-c(1:39193)

### get the genotype files for reference ######

### aims 3 #####

gen_a3<-read.table("../../clines/difSnps30pct.txt", header=F)

gen_a3<-cbind(num,gen_a3)

gen_a3<-as.matrix(gen_a3)

### getting aims loci

afdiff<-idas_pure_p-mel_sub_p

#for loop to flip genotypes

#files= list.files(path="../../entropy/complete/mcmc", pattern = "genEstMatrix_", full.names = TRUE)

files= list.files(path="./aims3", pattern = "genEstMatrix_", full.names = TRUE)

flipgeno<-function(input){

pop<-read.table(input, header=T)

gen_af<-as.data.frame(cbind(afdiff,pop))

for(i in 1:nrow(gen_af)){

if(gen_af[,1][i] < 0){

#print(gen_af$gen[i])

gen_af[i,][2:ncol(gen_af)] <- 2-gen_af[i,][2:ncol(gen_af)]

index = which(files == input)

outname = paste('flip', basename(input), sep = "_")

write.table(gen_af[,-1], file=outname, sep=" ", row.names=FALSE, quote=FALSE)

}

for (i in 1:length(files)){

flipgeno(files[i])

}

Step 2: Calculate within and across scaffold LD for AIMS

I did this analyses in the folder: /uufs/chpc.utah.edu/common/home/u6007910/projects/lyc_dubois/pairwiseLD/polarizingLD/aims3/

1. Here is the code for creating flipped files for AIMS3 SNPs:

#In R

#read in the mappos file

mp<-read.table("../../../popanc/mappos.txt",sep=":",header=F)

#loop for creating the subsetted files

files= list.files(path="../allsnps", pattern = "flip_genEstMatrix_", full.names = TRUE)

combine<-function(input){

pop<-read.table(input, header=T)

pop_mp<-cbind(mp,pop)

index = which(files == input)

outname = paste('mp', basename(input), sep = "_")

write.table(pop_mp, file=outname, sep=" ", col.names=TRUE, row.names=FALSE, quote=FALSE)

}

for (i in 1:length(files)){

combine(files[i])

}

This will create files named: mp_flip_genEstMatrix_*

#In bash

## subset aims SNPs using python script

for f in mp_flip_genEstMatrix_*; do

python subset_aims_genoEst.py $f aims3_${f}; done

This will create files named: aims3_mp_flip_genEstMatrix_*

2. Run LD using calcLD_scafs.R on aims3_mp_flip_genEstMatrix* files. The script is saved in the "aims3" folder. I copy pasted the code in R and just ran it there directly. Here is the code:

#read in the mappos file

scafpos<-read.table("./mappos3.txt",header=F)

#aims

files= list.files(path=".", pattern = "aims3_mp_flip_genEstMatrix_", full.names = TRUE)

## function to compute scaffold LD

calcLDdu = function(input){

pop<-read.table(input, header=F)

pop<-cbind(scafpos,pop) #N individuals (rows) X S snps (cols)

#print(pop)

#print(pop[,-c(1,2)])

#drop SNPs with MAF < 0.05

#relevant_row_num = which((rowMeans(pop[,-c(1,2)], na.rm = T))/2 > 0.05)

#print(relevant_row_num)

#pop_sub<-pop[relevant_row_num, ]

geno<-t(pop) # snps are cols, individuals are rows, #scaf pos retained as col

#print(geno)

# Example: some positions are dropped

# 1 2 3 4 5 6 8 9 10 11

# scaffold 4 4 4 4 4 4 4 6 6 6

# position 10035 10038 10041 10054 10059 10066 21035 21041 21044 21048

# V1 0 NA 0 1 0 2 2 0 2 0

# V2 0 1 0 0 0 1 0 1 0 1

nsnps<-1:ncol(geno)

out_col = ncol(geno)

out_row = out_col

Scafmat<-matrix(NA, nrow=out_row, ncol=out_col)

Du<-matrix(NA, nrow=out_row, ncol=out_col)

for (i in 1:(out_col-1)){

for (j in i:(out_col-1)){

if (i != j){

# Subset the col for computation

snp12<-geno[,c(i,j)]

scaf<-snp12[1, ]

if (scaf[1] == scaf[2]){

#print(scaf[1])

#print(scaf[2])

Scafmat[i,j]<- 1

}

else {

Scafmat[i,j]<- 0

}

#print(Scafmat)

#print(paste(" Scaff:", scaff_id, "Proc -> Mat(", i, j, ") S->(", scaf[1], scaf[2],"), P->(", pos[1], pos[2],")"))

tmp.geno <- snp12[!(is.na(snp12[,1])) & !(is.na(snp12[,2])),]

N <- dim(tmp.geno)[1]

pA <- (2*sum(tmp.geno[,1]==0) + sum(tmp.geno[,1]==1)) / (2*N)

pB <- (2*sum(tmp.geno[,2]==0) + sum(tmp.geno[,2]==1)) / (2*N)

pa <- 1-pA

pb <- 1-pB

# pAB <- (sum(tmp.geno[,1]==0 + tmp.geno[,2]==0)*2) + sum(tmp.geno[1,]==1 + tmp.geno[2,]==0) + sum(tmp.geno[1,]==0 + tmp.geno[2,]==1) + sum(tmp.geno[1,]==1 + tmp.geno[2,]==1) / (2*N)

nAABB = sum(tmp.geno[,1]==0 & tmp.geno[,2]==0) # 00

nAABb = sum(tmp.geno[,1]==0 & tmp.geno[,2]==1) # 01

nAaBB = sum(tmp.geno[,1]==1 & tmp.geno[,2]==0) # 10

nAaBb = sum(tmp.geno[,1]==1 & tmp.geno[,2]==1) # 11

###### calculate unbiased estimator D ##################

Du[i,j]<-(N / (N-1)) * (((4*nAABB + 2*(nAABb + nAaBB) + nAaBb)/ (2*N))-(2*pA*pB))

}

#get vectors to write out

ld<-Du[upper.tri(Du)]

#print(ld)

scafbin<-Scafmat[upper.tri(Scafmat)]

#print(scafbin)

ld_scaf<-cbind(scafbin,ld)

#print(ld_scaf)

index = which(files == input)

outname = paste('ldscaf', basename(input), sep = "_")

write.table(ld_scaf, file=outname, sep=" ", row.names=FALSE, quote=FALSE)

}

## apply the function for all the population files

for (i in 1:length(files)){

calcLDdu(files[i])

}

3. Plot LD from the code in plotLDscafs.R

Here is the code:

##plot parents vs DBS

ld_dbs<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_DBS.txt", header=T)

ld_cdy<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_CDY.txt", header=T)

ld_ckv<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_CKV.txt", header=T)

ld_sin<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_SIN.txt", header=T)

ld_bst<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_BST.txt", header=T)

ld_khl<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_KHL.txt", header=T)

ld_sdc<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_SDC.txt", header=T)

ld_syc<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_SYC.txt", header=T)

#trial plot

par(mfrow=c(2,2))

boxplot(ld~scafbin, data=ld_bld, ylab="Unbiased D", names=c("Within","Between"), main="BLD (N=74)")

boxplot(ld~scafbin, data=ld_frc, ylab="Unbiased D", names=c("Within","Between"), main="FRC (N=20)")

boxplot(ld~scafbin, data=ld_ckv, ylab="Unbiased D", names=c("Within","Between"), main="CKV (N=10)")

boxplot(ld~scafbin, data=ld_dbs, ylab="Unbiased D", names=c("Within","Between"), main="DBS (N=115)")

par(mfrow=c(2,2))

boxplot(abs(ld)~scafbin, data=ld_bld, ylab="Unbiased D", names=c("Within","Between"), main="BLD (N=74)")

boxplot(abs(ld)~scafbin, data=ld_frc, ylab="Unbiased D", names=c("Within","Between"), main="FRC (N=20)")

boxplot(abs(ld)~scafbin, data=ld_ckv, ylab="Unbiased D", names=c("Within","Between"), main="CKV (N=10)")

boxplot(abs(ld)~scafbin, data=ld_dbs, ylab="Unbiased D", names=c("Within","Between"), main="DBS (N=115)")

#dbs versus other high pop size pops

ld_bcr<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_BCR.txt", header=T)

ld_bld<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_BLD.txt", header=T)

ld_btb<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_BTB.txt", header=T)

ld_gnp<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_GNP.txt", header=T)

ld_sin<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_SIN.txt", header=T)

ld_dbs<-read.table("ldscaf_aims3_mp_flip_genEstMatrix_DBS.txt", header=T)

pdf("ldscafs_dbs_parents.pdf", width = 10, height = 8)

par(mfrow=c(4,2))

par(mar=c(2,7,1,6.5))

boxplot(ld~scafbin, data=ld_dbs, ylab="Unbiased D", names=c("Between","Within"), main="DBS (N=115)", yaxt="n")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_cdy, ylab="Unbiased D", names=c("Between", "Within"), main="CDY (N=23)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_ckv, ylab="Unbiased D", names=c("Between", "Within"), main="CKV (N=10)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_sin, ylab="Unbiased D", names=c("Between","Within"), main="SIN (N=97)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_bst, ylab="Unbiased D", names=c("Between","Within"), main="BST (N=24)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_khl, ylab="Unbiased D", names=c("Between","Within"), main="KHL (N=18)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_sdc, ylab="Unbiased D", names=c("Between","Within"), main="SDC (N=20)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_syc, ylab="Unbiased D", names=c("Between","Within"), main="SYC (N=20)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

dev.off()

## dbs and high pops ##

pdf("ldscafs_dbs_highpopsize.pdf", width = 10, height = 8)

par(mfrow=c(2,3))

par(mar=c(2,7,1,6.5))

#dbs

boxplot(ld~scafbin, data=ld_dbs, ylab="Unbiased D", names=c("Between","Within"), main="DBS (N=115)", yaxt="n")

ytick<-seq(-0.2,0.4,by=0.2)

axis(side=2, at=ytick, labels = T)

#bcr

boxplot(ld~scafbin, data=ld_bcr, ylab="Unbiased D", names=c("Between","Within"), main="BCR (N=46)", yaxt="n")

ytick<-seq(-0.2,0.4,by=0.2)

axis(side=2, at=ytick, labels = T)

#bld

boxplot(ld~scafbin, data=ld_bld, ylab="Unbiased D", names=c("Between","Within"), main="BLD (N=74)", yaxt="n")

ytick<-seq(-0.2,0.4,by=0.2)

axis(side=2, at=ytick, labels = T)

#btb

boxplot(ld~scafbin, data=ld_btb, ylab="Unbiased D", names=c("Between","Within"), main="BTB (N=46)", yaxt="n")

ytick<-seq(-0.2,0.4,by=0.2)

axis(side=2, at=ytick, labels = T)

#gnp

boxplot(ld~scafbin, data=ld_sin, ylab="Unbiased D", names=c("Between","Within"), main="GNP (N=98)", yaxt="n")

ytick<-seq(-0.2,0.4,by=0.2)

axis(side=2, at=ytick, labels = T)

#sin

boxplot(ld~scafbin, data=ld_sin, ylab="Unbiased D", names=c("Between","Within"), main="SIN (N=97)", yaxt="n")

ytick<-seq(-0.2,0.4,by=0.2)

axis(side=2, at=ytick, labels = T)

dev.off()

#### plotting within and across scaffolds separately

##within

#subset rows with 1

dbs_within<-ld_dbs[which(ld_dbs$scafbin == 1),]

bcr_within<-ld_bcr[which(ld_bcr$scafbin == 1),]

bld_within<-ld_bld[which(ld_bld$scafbin == 1),]

btb_within<-ld_btb[which(ld_btb$scafbin == 1),]

gnp_within<-ld_gnp[which(ld_gnp$scafbin == 1),]

sin_within<-ld_sin[which(ld_sin$scafbin == 1),]

#create vectors

bld_rep<-rep("bld", dim(bld_within)[1])

btb_rep<-rep("btb", dim(btb_within)[1])

sin_rep<-rep("sin", dim(sin_within)[1])

bcr_rep<-rep("bcr", dim(bcr_within)[1])

gnp_rep<-rep("gnp", dim(gnp_within)[1])

dbs_rep<-rep("dbs", dim(dbs_within)[1])

#combine

dbs_within<-cbind(dbs_rep,dbs_within)

bcr_within<-cbind(bcr_rep,bcr_within)

bld_within<-cbind(bld_rep,bld_within)

btb_within<-cbind(btb_rep,btb_within)

sin_within<-cbind(sin_rep,sin_within)

gnp_within<-cbind(gnp_rep,gnp_within)

colnames(dbs_within)<-c("pop","scaf","ld")

colnames(bcr_within)<-c("pop","scaf","ld")

colnames(bld_within)<-c("pop","scaf","ld")

colnames(btb_within)<-c("pop","scaf","ld")

colnames(sin_within)<-c("pop","scaf","ld")

colnames(gnp_within)<-c("pop","scaf","ld")

#combine all the pops

all_within<-rbind(dbs_within,bcr_within,bld_within,btb_within,sin_within,gnp_within)

boxplot(ld~pop, data=all_within)

##across

#subset

dbs_across<-ld_dbs[which(ld_dbs$scafbin == 0),]

bcr_across<-ld_bcr[which(ld_bcr$scafbin == 0),]

bld_across<-ld_bld[which(ld_bld$scafbin == 0),]

btb_across<-ld_btb[which(ld_btb$scafbin == 0),]

gnp_across<-ld_gnp[which(ld_gnp$scafbin == 0),]

sin_across<-ld_sin[which(ld_sin$scafbin == 0),]

#vectors

bld_rep<-rep("bld", dim(bld_across)[1])

btb_rep<-rep("btb", dim(btb_across)[1])

sin_rep<-rep("sin", dim(sin_across)[1])

bcr_rep<-rep("bcr", dim(bcr_across)[1])

gnp_rep<-rep("gnp", dim(gnp_across)[1])

dbs_rep<-rep("dbs", dim(dbs_across)[1])

#combine

dbs_across<-cbind(dbs_rep,dbs_across)

bcr_across<-cbind(bcr_rep,bcr_across)

bld_across<-cbind(bld_rep,bld_across)

btb_across<-cbind(btb_rep,btb_across)

sin_across<-cbind(sin_rep,sin_across)

gnp_across<-cbind(gnp_rep,gnp_across)

colnames(dbs_across)<-c("pop","scaf","ld")

colnames(bcr_across)<-c("pop","scaf","ld")

colnames(bld_across)<-c("pop","scaf","ld")

colnames(btb_across)<-c("pop","scaf","ld")

colnames(sin_across)<-c("pop","scaf","ld")

colnames(gnp_across)<-c("pop","scaf","ld")

#combine pops

all_across<-rbind(dbs_across,bcr_across,bld_across,btb_across,sin_across,gnp_across)

boxplot(ld~pop, data=all_across)

#plot

pdf("ldscafs_highpopsizes.pdf", width=13, height=12)

par(mfrow=c(2,1))

boxplot(ld~pop, data=all_within, ylab="Unbiased D", names=c("dbs-115","bcr-46","bld-74","btb-46","sin-97","gnp-98"), main="Within")

boxplot(ld~pop, data=all_across, ylab="Unbiased D", names=c("dbs-115","bcr-46","bld-74","btb-46","sin-97","gnp-98"), main="Across")

dev.off()

###plotting quantiles to compare DBS and SIN

#subset within and across

dbs_within<-ld_dbs[which(ld_dbs$scafbin == 1),]

dbs_across<-ld_dbs[which(ld_dbs$scafbin == 0),]

sin_within<-ld_sin[which(ld_sin$scafbin == 1),]

sin_across<-ld_sin[which(ld_sin$scafbin == 0),]

bld_within<-ld_bld[which(ld_bld$scafbin == 1),]

bld_across<-ld_bld[which(ld_bld$scafbin == 0),]

bcr_within<-ld_bcr[which(ld_bcr$scafbin == 1),]

bcr_across<-ld_bcr[which(ld_bcr$scafbin == 0),]

btb_within<-ld_btb[which(ld_btb$scafbin == 1),]

btb_across<-ld_btb[which(ld_btb$scafbin == 0),]

gnp_within<-ld_gnp[which(ld_gnp$scafbin == 1),]

gnp_across<-ld_gnp[which(ld_gnp$scafbin == 0),]

#take quantiles

#within

dbs_within_quant<-quantile(dbs_within$ld, probs=seq(0,1,0.01), na.rm=T)

sin_within_quant<-quantile(sin_within$ld, probs=seq(0,1,0.01), na.rm=T)

bcr_within_quant<-quantile(bcr_within$ld, probs=seq(0,1,0.01), na.rm=T)

bld_within_quant<-quantile(bld_within$ld, probs=seq(0,1,0.01), na.rm=T)

btb_within_quant<-quantile(btb_within$ld, probs=seq(0,1,0.01), na.rm=T)

gnp_within_quant<-quantile(gnp_within$ld, probs=seq(0,1,0.01), na.rm=T)

#across

dbs_across_quant<-quantile(dbs_across$ld, probs=seq(0,1,0.01), na.rm=T)

sin_across_quant<-quantile(sin_across$ld, probs=seq(0,1,0.01), na.rm=T)

bcr_across_quant<-quantile(bcr_across$ld, probs=seq(0,1,0.01), na.rm=T)

bld_across_quant<-quantile(bld_across$ld, probs=seq(0,1,0.01), na.rm=T)

btb_across_quant<-quantile(btb_across$ld, probs=seq(0,1,0.01), na.rm=T)

gnp_across_quant<-quantile(gnp_across$ld, probs=seq(0,1,0.01), na.rm=T)

## plot

pdf("qqplot_highpopsize_within.pdf", width=10, height=8)

#within

par(mfrow=c(2,3))

qqplot(dbs_within_quant, bcr_within_quant, main="DBS-BCR")

abline(0,1)

qqplot(dbs_within_quant, bld_within_quant, main="DBS-BLD")

abline(0,1)

qqplot(dbs_within_quant, btb_within_quant, main="DBS-BTB")

abline(0,1)

qqplot(dbs_within_quant, sin_within_quant, main="DBS-SIN")

abline(0,1)

qqplot(dbs_within_quant, gnp_within_quant, main="DBS-GNP")

abline(0,1)

dev.off()

pdf("qqplot_highpopsize_across.pdf", width=10, height=8)

#across

par(mfrow=c(2,3))

qqplot(dbs_across_quant, bcr_across_quant, main="DBS-BCR")

abline(0,1)

qqplot(dbs_across_quant, bld_across_quant, main="DBS-BLD")

abline(0,1)

qqplot(dbs_across_quant, btb_across_quant, main="DBS-BTB")

abline(0,1)

qqplot(dbs_across_quant, sin_across_quant, main="DBS-SIN")

abline(0,1)

qqplot(dbs_across_quant, gnp_across_quant, main="DBS-GNP")

abline(0,1)

dev.off()

##trial plot

par(mfrow=c(2,2))

#within

qqnorm(dbs_within_quant,pch=1, main="DBS-within", ylim=c(-0.3,0.8))

qqline(dbs_within_quant,col="steelblue",lwd=2)

qqnorm(sin_within_quant,pch=1, main="SIN-within", ylim=c(-0.3,0.8))

qqline(sin_within_quant,col="steelblue",lwd=2)

#across

qqnorm(dbs_across_quant,pch=1, main="DBS-across", ylim=c(-0.3,0.8))

qqline(dbs_across_quant,col="steelblue",lwd=2)

qqnorm(sin_across_quant,pch=1, main="SIN-across", ylim=c(-0.3,0.8))

qqline(sin_across_quant,col="steelblue",lwd=2)

dev.copy(pdf,"qqplot_dbs_sin.pdf")

dev.off()

####### THIS IS EXTRA STUFF#############################

I used the flipped genotype files (flip_genEstmatrix_*) to run the LD estimates using the calcLD_scafs.R script which calculates estimates between and within scaffolds. The script is saved in the same folder.

Here is that script:

#read in the scaffold, position file

scafpos<-read.table("../../popanc/mappos.txt",sep=":",header=F)

#aims

scafpos<-read.table("./mappos3.txt",sep=":",header=F)

files= list.files(path=".", pattern = "flip_", full.names = TRUE)

#aims

files= list.files(path=".", pattern = "aims3_", full.names = TRUE)

## function to compute scaffold LD

calcLDdu = function(input){

pop<-read.table(input, header=F)

pop<-cbind(scafpos,pop) #N individuals (rows) X S snps (cols)

#print(pop)

#print(pop[,-c(1,2)])

#drop SNPs with MAF < 0.05

relevant_row_num = which((rowMeans(pop[,-c(1,2)], na.rm = T))/2 > 0.05)

#print(relevant_row_num)

pop_sub<-pop[relevant_row_num, ]

geno<-t(pop_sub) # snps are cols, individuals are rows, #scaf pos retained as col

#print(geno)

# Example: some positions are dropped

# 1 2 3 4 5 6 8 9 10 11

# scaffold 4 4 4 4 4 4 4 6 6 6

# position 10035 10038 10041 10054 10059 10066 21035 21041 21044 21048

# V1 0 NA 0 1 0 2 2 0 2 0

# V2 0 1 0 0 0 1 0 1 0 1

nsnps<-1:ncol(geno)

out_col = ncol(geno)

out_row = out_col

Scafmat<-matrix(NA, nrow=out_row, ncol=out_col)

Du<-matrix(NA, nrow=out_row, ncol=out_col)

for (i in 1:(out_col-1)){

for (j in 1:(out_col-1)){

if (i != j){

# Subset the col for computation

snp12<-geno[,c(i,j)]

scaf<-snp12[1, ]

if (scaf[1] == scaf[2]){

#print(scaf[1])

#print(scaf[2])

Scafmat[i,j]<- 1

}

else {

Scafmat[i,j]<- 0

}

#print(Scafmat)

#print(paste(" Scaff:", scaff_id, "Proc -> Mat(", i, j, ") S->(", scaf[1], scaf[2],"), P->(", pos[1], pos[2],")"))

tmp.geno <- snp12[!(is.na(snp12[,1])) & !(is.na(snp12[,2])),]

N <- dim(tmp.geno)[1]

pA <- (2*sum(tmp.geno[,1]==0) + sum(tmp.geno[,1]==1)) / (2*N)

pB <- (2*sum(tmp.geno[,2]==0) + sum(tmp.geno[,2]==1)) / (2*N)

pa <- 1-pA

pb <- 1-pB

# pAB <- (sum(tmp.geno[,1]==0 + tmp.geno[,2]==0)*2) + sum(tmp.geno[1,]==1 + tmp.geno[2,]==0) + sum(tmp.geno[1,]==0 + tmp.geno[2,]==1) + sum(tmp.geno[1,]==1 + tmp.geno[2,]==1) / (2*N)

nAABB = sum(tmp.geno[,1]==0 & tmp.geno[,2]==0) # 00

nAABb = sum(tmp.geno[,1]==0 & tmp.geno[,2]==1) # 01

nAaBB = sum(tmp.geno[,1]==1 & tmp.geno[,2]==0) # 10

nAaBb = sum(tmp.geno[,1]==1 & tmp.geno[,2]==1) # 11

###### calculate unbiased estimator D ##################

Du[i,j]<-(N / (N-1)) * (((4*nAABB + 2*(nAABb + nAaBB) + nAaBb)/ (2*N))-(2*pA*pB))

}

#get vectors to write out

ld<-Du[upper.tri(Du)]

#print(ld)

scafbin<-Scafmat[upper.tri(Scafmat)]

#print(scafbin)

ld_scaf<-cbind(scafbin,ld)

#print(ld_scaf)

index = which(files == input)

outname = paste('ldscaf', basename(input), sep = "_")

write.table(ld_scaf, file=outname, sep=" ", row.names=FALSE, quote=FALSE)

}

## apply the function for all the population files

for (i in 1:length(files)){

calcLDdu(files[i])

}

I repeated this for all SNPs and AIMS3. For all the SNPs the files were too big and I am not able to see the output plot.

############################### AIMS3 #########################################################################

Here is the codes for creating flipped files for AIMS3 SNPs:

#read in the mappos file

mp<-read.table("../../../popanc/mappos.txt",sep=":",header=F)

#loop for creating the subsetted files

files= list.files(path="../allsnps", pattern = "flip_genEstMatrix_", full.names = TRUE)

combine<-function(input){

pop<-read.table(input, header=T)

pop_mp<-cbind(mp,pop)

index = which(files == input)

outname = paste('mp', basename(input), sep = "_")

write.table(pop_mp, file=outname, sep=" ", col.names=TRUE, row.names=FALSE, quote=FALSE)

}

for (i in 1:length(files)){

combine(files[i])

}

## subset aims SNPs using python script

for f in mp_flip_genEstMatrix_*; do

python subset_aims_genoEst.py $f aims3_${f}; done

Run LD using calcLD_scafs.R for these output files now. These files are still saved in this folder: /uufs/chpc.utah.edu/common/home/u6007910/projects/lyc_dubois/pairwiseLD/polarizingLD/aims3

########################## plots ######################################################

##plot parents vs DBS

ld_dbs<-read.table("ldscaf_aims3_dbs.txt", header=T)

ld_cdy<-read.table("ldscaf_aims3_cdy.txt", header=T)

ld_ckv<-read.table("ldscaf_aims3_ckv.txt", header=T)

ld_sin<-read.table("ldscaf_aims3_sin.txt", header=T)

ld_bst<-read.table("ldscaf_aims3_bst.txt", header=T)

ld_khl<-read.table("ldscaf_aims3_khl.txt", header=T)

ld_sdc<-read.table("ldscaf_aims3_sdc.txt", header=T)

ld_syc<-read.table("ldscaf_aims3_syc.txt", header=T)

#dbs versus othe high pop size pops

ld_bcr<-read.table("ldscaf_aims3_bcr.txt", header=T)

ld_bld<-read.table("ldscaf_aims3_bld.txt", header=T)

ld_btb<-read.table("ldscaf_aims3_btb.txt", header=T)

ld_sin<-read.table("ldscaf_aims3_sin.txt", header=T)

ld_dbs<-read.table("ldscaf_aims3_dbs.txt", header=T)

pdf("ldscafs_dbs_parents.pdf", width = 10, height = 8)

par(mfrow=c(4,2))

par(mar=c(2,7,1,6.5))

boxplot(ld~scafbin, data=ld_dbs, ylab="Unbiased D", names=c("Between","Within"), main="DBS (N=115)", yaxt="n")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_cdy, ylab="Unbiased D", names=c("Between", "Within"), main="CDY (N=23)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_ckv, ylab="Unbiased D", names=c("Between", "Within"), main="CKV (N=10)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_sin, ylab="Unbiased D", names=c("Between","Within"), main="SIN (N=97)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_bst, ylab="Unbiased D", names=c("Between","Within"), main="BST (N=24)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_khl, ylab="Unbiased D", names=c("Between","Within"), main="KHL (N=18)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_sdc, ylab="Unbiased D", names=c("Between","Within"), main="SDC (N=20)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

boxplot(ld~scafbin, data=ld_syc, ylab="Unbiased D", names=c("Between","Within"), main="SYC (N=20)")

ytick<-seq(-0.2,0.6,by=0.1)

axis(side=2, at=ytick, labels = T)

dev.off()

## doing within and across plots separately

##within

dbs_within<-ld_dbs[which(ld_dbs$scafbin == 1),]

bcr_within<-ld_bcr[which(ld_bcr$scafbin == 1),]

bld_within<-ld_bld[which(ld_bld$scafbin == 1),]

btb_within<-ld_btb[which(ld_btb$scafbin == 1),]

sin_within<-ld_sin[which(ld_sin$scafbin == 1),]

bld_rep<-rep("bld", 40602)

btb_rep<-rep("btb", 44571)

sin_rep<-rep("sin", 18001)

bcr_rep<-rep("bcr", 43563)

dbs_rep<-rep("dbs", 41140)

dbs_within<-cbind(dbs_rep,dbs_within)

bcr_within<-cbind(bcr_rep,bcr_within)

bld_within<-cbind(bld_rep,bld_within)

btb_within<-cbind(btb_rep,btb_within)

sin_within<-cbind(sin_rep,sin_within)

colnames(dbs_within)<-c("pop","scaf","ld")

colnames(bcr_within)<-c("pop","scaf","ld")

colnames(bld_within)<-c("pop","scaf","ld")

colnames(btb_within)<-c("pop","scaf","ld")

colnames(sin_within)<-c("pop","scaf","ld")

all_within<-rbind(dbs_within,bcr_within,bld_within,btb_within,sin_within)

boxplot(ld~pop, data=all_within)

##across

dbs_across<-ld_dbs[which(ld_dbs$scafbin == 0),]

bcr_across<-ld_bcr[which(ld_bcr$scafbin == 0),]

bld_across<-ld_bld[which(ld_bld$scafbin == 0),]

btb_across<-ld_btb[which(ld_btb$scafbin == 0),]

sin_across<-ld_sin[which(ld_sin$scafbin == 0),]

dim(bld_across)

dim(btb_across)

dim(sin_across)

dim(bcr_across)

dim(dbs_across)

bld_rep<-rep("bld", 518551)

btb_rep<-rep("btb", 591057)

sin_rep<-rep("sin", 287370)

bcr_rep<-rep("bcr", 577492)

dbs_rep<-rep("dbs", 551276)

dbs_across<-cbind(dbs_rep,dbs_across)

bcr_across<-cbind(bcr_rep,bcr_across)

bld_across<-cbind(bld_rep,bld_across)

btb_across<-cbind(btb_rep,btb_across)

sin_across<-cbind(sin_rep,sin_across)

colnames(dbs_across)<-c("pop","scaf","ld")

colnames(bcr_across)<-c("pop","scaf","ld")

colnames(bld_across)<-c("pop","scaf","ld")

colnames(btb_across)<-c("pop","scaf","ld")

colnames(sin_across)<-c("pop","scaf","ld")

all_across<-rbind(dbs_across,bcr_across,bld_across,btb_across,sin_across)

boxplot(ld~pop, data=all_across)

#plot

par(mfrow=c(2,1))

boxplot(ld~pop, data=all_within, ylab="Unbiased D", names=c("dbs-115","bcr-46","bld-74","btb-46","sin-97"), main="Within")

boxplot(ld~pop, data=all_across, ylab="Unbiased D", names=c("dbs-115","bcr-46","bld-74","btb-46","sin-97"), main="Across")

dev.copy(pdf, "ldscafs_highpopsizes.pdf")

dev.off()

###plotting quantiles to compare DBS and SIN

#subset within and across

dbs_within<-ld_dbs[which(ld_dbs$scafbin == 1),]

dbs_across<-ld_dbs[which(ld_dbs$scafbin == 0),]

sin_within<-ld_sin[which(ld_sin$scafbin == 1),]

sin_across<-ld_sin[which(ld_sin$scafbin == 0),]

bld_within<-ld_bld[which(ld_bld$scafbin == 1),]

bld_across<-ld_bld[which(ld_bld$scafbin == 0),]

bcr_within<-ld_bcr[which(ld_bcr$scafbin == 1),]

bcr_across<-ld_bcr[which(ld_bcr$scafbin == 0),]

btb_within<-ld_btb[which(ld_btb$scafbin == 1),]

btb_across<-ld_btb[which(ld_btb$scafbin == 0),]

#take quantiles

#within

dbs_within_quant<-quantile(dbs_within$ld, probs=seq(0,1,0.01), na.rm=T)

sin_within_quant<-quantile(sin_within$ld, probs=seq(0,1,0.01), na.rm=T)

bcr_within_quant<-quantile(bcr_within$ld, probs=seq(0,1,0.01), na.rm=T)

bld_within_quant<-quantile(bld_within$ld, probs=seq(0,1,0.01), na.rm=T)

btb_within_quant<-quantile(btb_within$ld, probs=seq(0,1,0.01), na.rm=T)

#across

dbs_across_quant<-quantile(dbs_across$ld, probs=seq(0,1,0.01), na.rm=T)

sin_across_quant<-quantile(sin_across$ld, probs=seq(0,1,0.01), na.rm=T)

bcr_across_quant<-quantile(bcr_across$ld, probs=seq(0,1,0.01), na.rm=T)

bld_across_quant<-quantile(bld_across$ld, probs=seq(0,1,0.01), na.rm=T)

btb_across_quant<-quantile(btb_across$ld, probs=seq(0,1,0.01), na.rm=T)

## plot

#within

par(mfrow=c(2,4))

qqplot(dbs_within_quant, bcr_within_quant, main="DBS-BCR")

qqplot(dbs_within_quant, bld_within_quant, main="DBS-BLD")

qqplot(dbs_within_quant, btb_within_quant, main="DBS-BTB")

qqplot(dbs_within_quant, sin_within_quant, main="DBS-SIN")

#across

qqplot(dbs_across_quant, bcr_across_quant, main="DBS-BCR")

qqplot(dbs_across_quant, bld_across_quant, main="DBS-BLD")

qqplot(dbs_across_quant, btb_across_quant, main="DBS-BTB")

qqplot(dbs_across_quant, sin_across_quant, main="DBS-SIN")

dev.copy(pdf, "qqplot_highpopsize.pdf")

dev.off()

Page updated

Google Sites

Report abuse