Post date: Aug 17, 2018 12:36:45 AM
1. Dump hdf5 into plain text
/uufs/chpc.utah.edu/common/home/u0795641/projects/cmac_qtl/popgen
h5dump --dataset=g outp_bayes_L1.hdf5 > g_L1.txt
In hdf5 text:
:%s/:/,/g
:%s/(//
:%s/)//
:%s/,$//
2. Creating ID file
Copy ID line of cmacseries* to *_ids.txt:
:%s/ /\r/g
cut -f 1 -d " " cmacseries_L1.gl | perl -p -i -e 's/^(\S+)/\1/' > locusids1.txt
tail -n +4 locusids1.txt > locusids2.txt
sed 's/$/:/' locusids2.txt > locusids3.txt
perl -lne 'print $_, " ", $.' locusids3.txt > locusids4.txt
sed 's/ //g' locusids4.txt > locusids.txt
3. Create Locus IDs:
cut -f 1 -d " " cmacseries_L1.gl | perl -p -i -e 's/^(\S+)/\1/' > locusids1.txt
tail -n +4 locusids1.txt > locusids.txt
X.
Input files:
Formatted genotype file for Lentil references (g_L1ref.txt)
Formatted genotype file for Mung references (g_Mref.txt)
ID files for each
Scaffold:Position for SNPs in the Lentil and Mung references
Formatted genotype file for backcross
IDs for backcross individuals
locus IDs for backcross
Y<-matrix(scan("g_L1ref.txt",n=863640*6,sep=","),nrow=863640,ncol=6,byrow=TRUE)
M<-matrix(scan("g_Mref.txt",n=1036368*6,sep=","),nrow=1036368,ncol=6,byrow=TRUE)
mids<-read.table('Mref_ids.txt', header=FALSE)
l1ids<-read.table('L1ref_ids.txt', header=FALSE)
Y[,1:2]<-Y[,1:2]+1
for(i in 1:nrow(Y)){
if(Y[i,4]>=0.95){Y[i,3]<-0}
else{if(Y[i,5]>=0.95){Y[i,3]<-1}
else{if(Y[i,6]>=0.95){Y[i,3]<-2}
else{Y[i,3]<--9}}}
}
M[,1:2]<-M[,1:2]+1
for(i in 1:nrow(M)){
if(M[i,4]>=0.95){M[i,3]<-0}
else{if(M[i,5]>=0.95){M[i,3]<-1}
else{if(M[i,6]>=0.95){M[i,3]<-2}
else{M[i,3]<--9}}}
}
len1<-matrix(-7,ncol=1,nrow=34284)
len2<-matrix(-7,ncol=1,nrow=34284)
##-7 equals missing locus
refloci<-read.table('L123loci.txt', header=FALSE)
L1ref<-matrix(NA, ncol=21591+1, nrow=(40*2)+1)
Mref<-matrix(NA, ncol=21591+1, nrow=(48*2)+1)
mids<-read.table('Mref_ids.txt', header=FALSE)
l1ids<-read.table('L1ref_ids.txt', header=FALSE)
p<-seq(2,80,2)
q<-seq(3,81,2)
r<-seq(2,96,2)
s<-seq(3,97,2)
L1ref[p,1]<-as.character(l1ids[,1])
L1ref[q,1]<-as.character(l1ids[,1])
Mref[r,1]<-as.character(mids[,1])
Mref[s,1]<-as.character(mids[,1])
L1ref[1,2:21592]<-as.character(refloci[,1])
Mref[1,2:21592]<-as.character(refloci[,1])
L1dum<-matrix(-9, ncol=34284+1, nrow=(40*2))
Mdum<-matrix(-9, ncol=34284+1, nrow=(48*2))
p<-seq(1,80,2)
q<-seq(2,80,2)
r<-seq(1,96,2)
s<-seq(2,96,2)
L1dum[p,1]<-as.character(l1ids[,1])
L1dum[q,1]<-as.character(l1ids[,1])
Mdum[r,1]<-as.character(mids[,1])
Mdum[s,1]<-as.character(mids[,1])
p<-seq(2,80,2)
q<-seq(3,81,2)
r<-seq(2,96,2)
s<-seq(3,97,2)
for(i in 1:40){
#for individual 1, get all loci, then move to ind 2
locus<-seq(0+i,863640,40)
for(j in 1:21591){
if(Y[locus[j],3]==-9){L1ref[p[i],j+1]<--9;L1ref[q[i],j+1]<--9}
else{if(Y[locus[j],3]==0){L1ref[p[i],j+1]<-1;L1ref[q[i],j+1]<-1}
else{if(Y[locus[j],3]==1){L1ref[p[i],j+1]<-1;L1ref[q[i],j+1]<-2}
else{if(Y[locus[j],3]==2){L1ref[p[i],j+1]<-2;L1ref[q[i],j+1]<-2}}}}
}}
for(i in 1:48){
#for individual 1, get all loci, then move to ind 2
locus<-seq(0+i,1036368,48)
for(j in 1:21591){
if(M[locus[j],3]==-9){Mref[r[i],j+1]<--9;Mref[s[i],j+1]<--9}
else{if(M[locus[j],3]==0){Mref[r[i],j+1]<-1;Mref[s[i],j+1]<-1}
else{if(M[locus[j],3]==1){Mref[r[i],j+1]<-1;Mref[s[i],j+1]<-2}
else{if(M[locus[j],3]==2){Mref[r[i],j+1]<-2;Mref[s[i],j+1]<-2}}}}
}}
X<-matrix(scan("g_L1.txt",n=8262444*6,sep=","),nrow=8262444,ncol=6,byrow=TRUE)
X[,1:2]<-X[,1:2]+1
for(i in 1:nrow(X)){
if(X[i,4]>=0.95){X[i,3]<-0}
else{if(X[i,5]>=0.95){X[i,3]<-1}
else{if(X[i,6]>=0.95){X[i,3]<-2}
else{X[i,3]<--9}}}
}
loci<-read.table('locusids2.txt', header=FALSE)
L1pre<-matrix(NA, ncol=34284+1, nrow=(241*2)+1)
ids<- read.table('L1_ids.txt', header=FALSE,)
p<-seq(2,482,2)
q<-seq(3,483,2)
L1pre[p,1]<-as.character(ids[,1])
L1pre[q,1]<-as.character(ids[,1])
L1pre[1,2:34285]<-as.character(loci[,1])
L1<-rbind(L1pre,L1dum,Mdum)
pos<-which(loci[,1] %in% refloci[,1])
##484-563, L1
##564-659, M
L1[484:563,pos+1]<-L1ref[-1,-1]
L1[564:659,pos+1]<-Mref[-1,-1]
# -9=-9/-9, 0=1/1 ref/ref, 1=1/2, 2=2/2 alt/alt
for(i in 1:241){
#for individual 1, get all loci, then move to ind 2
locus<-seq(0+i,8262444,241)
for(j in 1:34284){
if(X[locus[j],3]==-9){L1[p[i],j+1]<--9;L1[q[i],j+1]<--9}
else{if(X[locus[j],3]==0){L1[p[i],j+1]<-1;L1[q[i],j+1]<-1}
else{if(X[locus[j],3]==1){L1[p[i],j+1]<-1;L1[q[i],j+1]<-2}
else{if(X[locus[j],3]==2){L1[p[i],j+1]<-2;L1[q[i],j+1]<-2}}}}
}}
loci2<-read.table('locusids2.txt', header=FALSE, sep=':')
for(i in 1:(nrow(loci2)-1)){
if(loci2[i,1]==loci2[i+1,1]){
loci2[i,3]<-loci2[i+1,2]-loci2[i,2]
} else{loci2[i,3]<--1}
}
## make last entry -1
loci2[34284,3]<--1
loci3<-rbind(c('','',''),loci2)
x<-L1[,2:34285][,order(loci2[,1])]
y<-cbind(L1[,1],x)
z<-rbind(head(y,1),loci3[,3],tail(y,-1))
z[1,1]<-''
popinfo<-c('','',rep(0,241*2),rep(1,40*2),rep(2,48*2))
final<-cbind(z[,1],popinfo,z[,-1])
write.table(final,'L1_struc_refs_in.txt', quote=FALSE, sep = ' ', na='NA', row.names=FALSE, col.names=FALSE)
#go in and delete first NA