To do:
1). Repeat for Trinity de novo.
#linux
I copied the transcript.gtf file from each sample's folder to the main folder as <samplenumber>.gtf (e.g. KS001.gtf). I then used the following Linux commands to create the final coverage and FPKM files for each sample. The temp folder in the main folder contains all of the files listed below for each sample. I then moved the final FPKM and coverage files (*_final_fpkm and *_final_cov) from the temp folder to the main folder. Repeat the same steps for the Trinity de novo reference.
#subset based on transcript in column 3
awk -F"\t" '$3 == "transcript" { print $1"\t"$3"\t"$9 }' KS001.gtf > KS001_transcripts.gtf
#get only FPKM values
grep -E -o "FPKM\s\"[0-9.]+\"" KS001_transcripts.gtf | grep -E -o "[0-9.]+" > KS001_fpkm.gtf
#get coverage column
grep -E -o "cov\s\"[0-9.]+\"" KS001_transcripts.gtf | grep -E -o "[0-9.]+" > KS001_cov.gtf
#get transcript id
grep -E -o "transcript_id\s\"[A-Z0-9.]+\"" KS001_transcripts.gtf | grep -E -o "[A-Z0-9.]+" > KS001_transid.gtf
#for loop for all files
for file in ./*.gtf; do
awk -F"\t" '$3 == "transcript" { print $1"\t"$3"\t"$9 }' $file > ${file}_transcripts \
cut -f 1 ${file}_transcripts > ${file}_scafpos \
grep -E -o "FPKM\s\"[0-9.]+\"" ${file}_transcripts | grep -E -o "[0-9.]+" > ${file}_fpkm \
grep -E -o "cov\s\"[0-9.]+\"" ${file}_transcripts | grep -E -o "[0-9.]+" > ${file}_cov \
grep -E -o "transcript_id\s\"[A-Z0-9.]+\"" ${file}_transcripts | grep -E -o "[A-Z0-9.]+" > ${file}_transid \
paste ${file}_scafpos ${file}_transid ${file}_fpkm > ${file}_final_fpkm \
paste ${file}_scafpos ${file}_transid ${file}_cov > ${file}_final_cov
done
###In R
library(dplyr)
library(tidyverse)
#read in to R
##### FPKM values ##################################
ks1<-read.table("KS001.gtf_final_fpkm", header=FALSE)
ks2<-read.table("KS002.gtf_final_fpkm", header=FALSE)
ks3<-read.table("KS003.gtf_final_fpkm", header=FALSE)
ks4<-read.table("KS004.gtf_final_fpkm", header=FALSE)
pmk1<-read.table("PMKS001.gtf_final_fpkm", header=FALSE)
pmk2<-read.table("PMKS002.gtf_final_fpkm", header=FALSE)
pmk3<-read.table("PMKS003.gtf_final_fpkm", header=FALSE)
pmk4<-read.table("PMKS004.gtf_final_fpkm", header=FALSE)
pmk5<-read.table("PMKS005.gtf_final_fpkm", header=FALSE)
pmk6<-read.table("PMKS006.gtf_final_fpkm", header=FALSE)
pmk7<-read.table("PMKS007.gtf_final_fpkm", header=FALSE)
pmk8<-read.table("PMKS008.gtf_final_fpkm", header=FALSE)
#subsetting based on transcript IDs. This function will find unique IDs and give NA for transcripts which are not found in the sample
all<-list(ks1,ks2,ks3,ks4, pmk1,pmk2,pmk3,pmk4,pmk5,pmk6,pmk7,pmk8) %>% reduce(full_join, by="V2")
head(all)
dim(all)
##get all the fpkm values columns
all_nums<-Filter(is.numeric,all) #dim = 51428R 12C
final<-cbind(all[,2],all_nums)
colnames(final)<-c("transcript_id","KS001","KS002","KS003","KS004","PMKS001","PMKS002","PMKS003","PMK004","PMKS005","PMKS006","PMKS007","PMKS008")
write.table(t(final),"fpkmvalues.txt", sep=" ", quote=FALSE, col.names=FALSE,row.names=TRUE)
################### coverage values #####################
#read in to R
ks1c<-read.table("KS001.gtf_final_cov", header=FALSE)
ks2c<-read.table("KS002.gtf_final_cov", header=FALSE)
ks3c<-read.table("KS003.gtf_final_cov", header=FALSE)
ks4c<-read.table("KS004.gtf_final_cov", header=FALSE)
pmk1c<-read.table("PMKS001.gtf_final_cov", header=FALSE)
pmk2c<-read.table("PMKS002.gtf_final_cov", header=FALSE)
pmk3c<-read.table("PMKS003.gtf_final_cov", header=FALSE)
pmk4c<-read.table("PMKS004.gtf_final_cov", header=FALSE)
pmk5c<-read.table("PMKS005.gtf_final_cov", header=FALSE)
pmk6c<-read.table("PMKS006.gtf_final_cov", header=FALSE)
pmk7c<-read.table("PMKS007.gtf_final_cov", header=FALSE)
pmk8c<-read.table("PMKS008.gtf_final_cov", header=FALSE)
#subsetting based on transcript IDs. This function will find unique IDs and give NA for transcripts which are not found in the sample
allcov<-list(ks1c,ks2c,ks3c,ks4c,pmk1c,pmk2c,pmk3c,pmk4c,pmk5c,pmk6c,pmk7c,pmk8c) %>% reduce(full_join, by="V2")
head(allcov)
dim(allcov)
##get all the fpkm values columns
allcov_nums<-Filter(is.numeric,allcov) #dim = 51428R 12C
final_cov<-cbind(allcov[,2],allcov_nums)
colnames(final_cov)<-c("transcript_id","KS001","KS002","KS003","KS004","PMKS001","PMKS002","PMKS003","PMK004","PMKS005","PMKS006","PMKS007","PMKS008")
write.table(t(final_cov),"coveragevalues.txt", sep=" ", quote=FALSE, col.names=FALSE,row.names=TRUE)