Data matching percentage check

(A1A2 check and flip)

email title: "data matching percentage check"

March 25th, 2021 (date the final version of the file was sent out)

Location on server: /space/chen-syn01/1/data/cinliu/data/dat4Tyler

Goal:

We wanted to know how different the old and the new data are.

column by column check
A1 A2 match test

After the check, if A1 and A2 do not match up between the old and new data, we flip the genotype coding 0,1,2 -> 2,1,0.

Results ("ClzTOP-dataTOP.RData" vs. "NEW_TOP_DATA.RData"):

39065 SNPs qualified for making the A1 A2 flip.

After the A1 A2 flip the perfect matches then become 43.87%

I will attach the summary text file in this email for more details.

summary.txt

Summary:

Percentage of matches (factoring out the extra FIDs in new data): 90.74

Percentage of matches (factoring out multiple factors)*: 95.1

Number of matches*: 85614177

Number of mismatches*: 4408173

*Note: the calculation factors out the information below:

FIDs removed from old data(ClzTOP-dataTOP.RData) because no matches were found: 53

FIDs removed from new data(NEW_TOP_DATA.RData) because no matches were found: 1096

SNPs removed old data(ClzTOP-dataTOP.RData) because no matches were found: 838

SNPs removed from new data(NEW_TOP_DATA.RData) because of duplication: 122

Columns listed below were also removed from old data(ClzTOP-dataTOP.RData):

y, gender, C1, C2, C3, C4

Columns listed below were also removed from new data(NEW_TOP_DATA.RData):

FID, PAT, MAT, SEX, PHENOTYPE

I've uploaded the R code used, and the list of SNP names that had the 0,1,2 -> 2,1,0 flip applied, to a OneDrive folder here:

Matching_check

More information you might be interested in:

21 ambiguous SNPs were found in the new data set. All 21 SNPs are within the 122 triallelic-variant SNPs and were removed for this analysis.
No ambiguous SNPs found in the old data set (I think the previous post-doc already cleaned it).

For "ClzTOP-dataClz.RData" (which I copied via scp from the server at /space/syn09/1/data/nsanyal/GWASinlps/ClzTOP/ClzTOP_anal/ClzTOP-dataClz.RData), no FID matched the new data, so I couldn't run any analysis on it. In other words there was a 0% match, or maybe Anu was referring to a different set of Clz data?

flip_list_generated_and_ambig_check.R

(This script does not alter the data in any way; it only runs the checks and prints a summary, and it also generates the list of SNPs that qualify for flipping.)

# Ambiguous-SNP check & A1/A2 match / flip-needed check.
# Simply matches the new and old A1 and A2 by SNP name.
# SNP names of all the old files are identical
# (detailed check in the ambig_flip_a1a2_check file).
library(stringr)

dir <- "/Users/nini/Desktop/2021lab/Tyler/data/" # UPDATE
# The working directory is changed because later steps write their output
# (e.g. "Flip_list.csv") using relative paths.
setwd(dir)

clozapine.minor.noambig <- read.csv(
  "~/Desktop/2021lab/Tyler/clozapine.minor.noambig.dose",
  sep = "",
  stringsAsFactors = FALSE
)
old_df <- clozapine.minor.noambig
new_dat <- readRDS(paste0(dir, "NEW_TOP_DATA.RData"))

########## DATA CLEANING ##################
# Drop the first six columns so that only the SNP columns remain
# (presumably FID, IID, PAT, MAT, SEX, PHENOTYPE -- confirm against new_dat).
n <- new_dat[-c(1:6)]
new_names <- colnames(n)

# Each SNP column name is split on ":" into four pieces; the fourth piece has
# the form "A2_A3" and is separated into A2 and A3 below.
nsplit <- str_split(new_names, ":")
new_df <- as.data.frame(do.call(rbind, nsplit), stringsAsFactors = FALSE)
colnames(new_df) <- c("CHR", "SNP", "A1", "A2")
new_df$A3 <- gsub(".*_", "", new_df$A2) # A3 is the minor allele
new_df$A2 <- gsub("_.*", "", new_df$A2)
new_df$SNP <- paste0(new_df$CHR, ":", new_df$SNP)

## Remove triallelic variants: every SNP name that occurs more than once is
## dropped entirely (all of its copies).
is_triallelic <- duplicated(new_df$SNP) |
  duplicated(new_df$SNP, fromLast = TRUE)
double <- which(is_triallelic)
# A logical mask is used instead of new_df[-double, ]: with an empty `double`
# the negative-index form would select zero rows instead of all of them.
new_df <- new_df[!is_triallelic, ]
# triallelic variants removed
# New data: reorder A1/A2 so that the minor allele (A3) is A1.
# Where A1 is not the minor allele but A2 is, A1 and A2 are switched.
# Rows where the comparison is NA are left untouched.
new_df_A1A3_flip_needed <- which(
  new_df$A1 != new_df$A3 & new_df$A2 == new_df$A3
)
count <- length(new_df_A1A3_flip_needed)

# A2 takes the old A1 first, then A1 takes the minor allele.
new_df$A2[new_df_A1A3_flip_needed] <- new_df$A1[new_df_A1A3_flip_needed]
new_df$A1[new_df_A1A3_flip_needed] <- new_df$A3[new_df_A1A3_flip_needed]

# Optional export of the SNPs that were switched inside the new data:
# SNP_new_dat_flip <- new_df$SNP[new_df_A1A3_flip_needed]
# SNP_new_dat_flip <- as.data.frame(SNP_new_dat_flip)
# write.csv(SNP_new_dat_flip, "NEW_DAT_innerFlip_list.csv", row.names = FALSE)

print(paste("new_df had",count,"SNPs where the minor allele was not A1, this has now been corrected!" ))
# 39042 did not have A1 as the minor allele, now corrected

# Double check: A1 should now equal A3 for every SNP.
a1_is_minor_allele <- identical(new_df$A1, new_df$A3)
print(a1_is_minor_allele)
if (!a1_is_minor_allele) {
  warning(
    "A1 still differs from the minor allele (A3) for some SNPs",
    call. = FALSE
  )
}
########## DATA CLEANING END ##################
#### AMBIGUOUS SNPs CHECK ######

# Find strand-ambiguous SNPs, i.e. rows whose (A1, A2) pair is A/T, T/A, C/G
# or G/C.
#
# dat: data frame with allele columns `A1` and `A2`. The columns `CHR`, `SNP`
#      and `A3` are only used to build the printed label of each hit.
#
# Prints one "CHR:SNP:A1:A2_A3" label per ambiguous row and returns the row
# numbers of the ambiguous SNPs (an empty integer vector when there are none).
# Rows with a missing allele are never reported, and an empty data frame is
# handled without error.
ambig_check <- function(dat) {
  allele_pair <- paste(as.character(dat$A1), as.character(dat$A2), sep = ":")
  ALL_Am <- which(allele_pair %in% c("A:T", "T:A", "C:G", "G:C"))
  for (i in ALL_Am) {
    print(paste0(dat$CHR[i],":",dat$SNP[i],":",dat$A1[i],":",dat$A2[i],"_",dat$A3[i]))
  }
  ALL_Am
}

# no ambiguous SNPs after removing doubles (122 double SNPs removed)
newdf_am_snp <- ambig_check(new_df)
# none
d_am_snp <- ambig_check(clozapine.minor.noambig)
#### AMBIGUOUS SNPs CHECK END ######

#### A1A2 MATCH CHECK #####
# Keep only the old SNPs that also exist in the new data.
old_in_new <- old_df$SNP %in% new_df$SNP
nogo <- which(!old_in_new)
# old_new_SNP_mismatch <- length(nogo) # 4189 SNPs don't match up in old vs new DF
# A logical mask is used instead of old_df[-nogo, ]: with an empty `nogo` the
# negative-index form would select zero rows instead of all of them.
old_df <- old_df[old_in_new, ]

# The row-by-row comparison that follows needs a one-to-one correspondence.
if (anyDuplicated(old_df$SNP) > 0) {
  stop(
    "old_df contains duplicated SNP names; rows cannot be aligned one-to-one",
    call. = FALSE
  )
}

# Reorder the new SNP table to the order of the old one. SNPs that exist only
# in the new data are dropped here, so both tables end up with the same rows.
new_df <- new_df[match(old_df$SNP, new_df$SNP), ]
rownames(new_df) <- new_df$SNP

# Double check: both tables must now list the same SNPs in the same order.
stopifnot(identical(as.character(new_df$SNP), as.character(old_df$SNP)))
####### Check for differences between A1 in the old vs the new data ########
# Rows are compared by position, so new_df and old_df must already be in the
# same SNP order. Indexing the old alleles by the new row positions pads with
# NA any new row that has no old counterpart; such rows are skipped rather than
# reported as mismatches.
row_idx <- seq_len(nrow(new_df))
new_a1 <- as.character(new_df$A1)
new_a2 <- as.character(new_df$A2)
old_a1 <- as.character(old_df$A1)[row_idx]

# A1 no match: rows where the new A1 differs from the old A1.
A1mismatch_rownum <- which(new_a1 != old_a1)
count <- length(A1mismatch_rownum)
print(count)

# Double check -- A1 no match & A2 no match: among the mismatching rows, those
# where the new A2 does not equal the old A1 either.
# (cross check done, 38497 == TRUE)
new_a2_differs <- new_a2[A1mismatch_rownum] != old_a1[A1mismatch_rownum]
A1noA2no <- A1mismatch_rownum[which(new_a2_differs)]
count <- length(A1noA2no)
print(count)
# If count = 0 (A1noA2no is empty), every SNP whose A1 does not match the old
# A1 matches it with A2 instead, so it is safe to flip.

# Conclusion: only keep SNPs where A1 = A2 & A2 = A1. Rows that failed the
# cross check are not a plain allele swap and are left out of the flip list.
if (length(A1noA2no) > 0) {
  warning(
    length(A1noA2no),
    " SNP(s) mismatch on A1 without matching on A2; excluded from Flip_list.csv",
    call. = FALSE
  )
}
SNP_name_needflip <- old_df$SNP[setdiff(A1mismatch_rownum, A1noA2no)]
write.csv(SNP_name_needflip, "Flip_list.csv", row.names = FALSE)
### DOUBLE CHECK - reverse check ########
# Check for A1 = A1 and A2 = A2. Rows are compared by position; a new row with
# no old counterpart compares as NA and is counted in neither group.
row_idx <- seq_len(nrow(new_df))
new_a1 <- as.character(new_df$A1)
new_a2 <- as.character(new_df$A2)
old_a1 <- as.character(old_df$A1)[row_idx]
old_a2 <- as.character(old_df$A2)[row_idx]

# Rows where the new A1 equals the old A1.
A1same <- which(new_a1 == old_a1)
count <- length(A1same)
print(count)

# Among those rows, the ones where A2 nevertheless differs.
A2notsame <- A1same[which(new_a2[A1same] != old_a2[A1same])]
count <- length(A2notsame)
# Double check: if count = 0, A2 matches wherever A1 matches.
print(count)

# Count check: every row should be either an A1 mismatch or an A1 match.
print(length(A1mismatch_rownum) + length(A1same) == nrow(new_df))

match_check.R

(This is where we do the 0,1,2 -> 2,1,0 flip, at `d1[,i] <- 2 - d1[,i]`.)

library(dplyr)

dir <- "/Users/nini/Desktop/2021lab/Tyler/data/" # UPDATE
# The working directory is changed because the outputs written later
# ("mismatch_count_per_row.csv", "summary.txt") use relative paths.
setwd(dir)

# Nini note: not sure why there is a warning when trying to import with
# load(), so readRDS() had to be used instead.
dat <- readRDS(paste0(dir, "ClzTOP-dataTOP.RData"))
new_dat <- readRDS(paste0(dir, "NEW_TOP_DATA.RData"))
#### ROWS ####
# Change the new IDs to match the old ID format (assumption: an ID still refers
# to the same sample after these changes). The "FAM001_" prefix is removed,
# then everything from the first remaining underscore onwards.
new_dat$IID <- gsub("FAM001_", "", new_dat$IID)
new_dat$IID <- gsub("\\_.*", "", new_dat$IID)

# Drop the non-genotype columns and remember their names for the summary.
d <- dat[-c(2:7)] # remove y, C1, C2, C3, C4, gender
count_col_old_removed <- colnames(dat)[2:7]
n <- new_dat[-c(1, 3, 4, 5, 6)] # remove FID, PAT, MAT, PHENOTYPE, SEX
count_col_new_removed <- colnames(new_dat)[c(1, 3, 4, 5, 6)]

# Keep only the samples present in both data sets. Logical masks are used
# instead of d[-nogo, ]: with an empty `nogo` the negative-index form would
# select zero rows instead of all of them.
# There are 53 old IDs with no match in the new data.
old_has_match <- dat$FID %in% new_dat$IID
count_old_FID_remove <- sum(!old_has_match) # for summary
d <- d[old_has_match, ]
# 1096 new IDs have no match in the old data (need to be eliminated).
new_has_match <- new_dat$IID %in% dat$FID
count_new_FID_remove <- sum(!new_has_match) # for summary
n <- n[new_has_match, ]
# Double-checking measure:
# summary(match(n$IID, d$FID)) # there should be no NAs any more

#### Reorder rows ####
# Put the new samples in the same order as the old ones, then move the ID
# column into the row names of BOTH tables so that they can be compared later.
# (Row names must be unique, so a duplicated ID stops the script here.)
n <- n[match(d$FID, n$IID), ]
rownames(n) <- n$IID
n <- n[-1] # remove the id column now that it is the rownames
rownames(d) <- d$FID
d <- d[-1] # remove the id column now that it is the rownames
###### COLUMNS ######
# Change the new SNP names to match the old name format (assumption: a name
# still refers to the same SNP after these changes).
# The ":A1:A2_A3" single-letter allele suffix is removed, ":" becomes "." and
# an "X" prefix is added.
colnames(n) <- gsub(":[A-Z]:[A-Z]_[A-Z]", "", colnames(n)) # remove letter format
colnames(n) <- gsub(":", ".", colnames(n))
colnames(n) <- paste0("X", colnames(n))

# Drop every SNP name that occurs more than once in the new data (all copies).
# Logical masks are used instead of n[-double]: with an empty index the
# negative-index form would select zero columns instead of all of them.
is_duplicated <- duplicated(colnames(n)) |
  duplicated(colnames(n), fromLast = TRUE)
double <- which(is_duplicated)
count_new_SNP_removed <- length(double)
# written into 'n1' instead of 'n' to avoid a warning about initialization
n1 <- n[!is_duplicated]
remove(n)

# Clean after the doubles are removed: drop old SNPs missing from the new data.
old_in_new <- colnames(d) %in% colnames(n1)
count_old_SNP_removed <- sum(!old_in_new)
d <- d[old_in_new]

# The comparison further down is done by column position, so the new data is
# restricted to the shared SNPs and put in the same column order as the old.
n1 <- n1[colnames(d)]

# Convert to integer to match the 'n1' data frame. Assigning into d1[] keeps
# the data-frame shape, column names and row names even for a single row.
d1 <- d
d1[] <- lapply(d, as.integer)
########## Load the list of SNPs that need to be flipped because A1 = A2 (or A3)
# Read from `dir`, like the other inputs of this script.
Flip_list <- read.csv(paste0(dir, "Flip_list.csv"))

# Bring the SNP names into the same "X<chr>.<pos>" format as the column names.
colnames(Flip_list) <- c("SNP")
Flip_list$SNP <- gsub(":", ".", Flip_list$SNP)
Flip_list$SNP <- paste0("X", Flip_list$SNP)

# summary(match(Flip_list$SNP, colnames(d1))) # 39 no matches
# unique() guards against a SNP listed twice being flipped back again.
flip_col <- unique(match(Flip_list$SNP, colnames(d1)))
flip_col <- flip_col[!is.na(flip_col)]
# Flip the genotype coding 0,1,2 -> 2,1,0; 2L keeps integer columns integer.
for (i in flip_col) {
  d1[, i] <- 2L - d1[, i]
}

### testing #####
# The cells are compared by position, so both tables must have the same
# columns in the same order.
same_columns <- identical(colnames(n1), colnames(d1))
print(same_columns)
print(identical(rownames(n1), rownames(d1)))
if (!same_columns) {
  warning(
    "column names of n1 and d1 differ; positional comparison is unreliable",
    call. = FALSE
  )
}

# Number of cells that differ between one new and one old genotype column.
# A value missing on only one side counts as a mismatch; missing on both sides
# counts as a match.
count_column_mismatches <- function(new_col, old_col) {
  new_missing <- is.na(new_col)
  old_missing <- is.na(old_col)
  differs <- new_missing != old_missing |
    (!new_missing & !old_missing & new_col != old_col)
  as.numeric(sum(differs))
}

# One mismatch count per SNP column, computed a whole column at a time.
mismatch_count <- vapply(
  seq_along(d1),
  function(k) count_column_mismatches(n1[[k]], d1[[k]]),
  numeric(1)
)
total <- sum(mismatch_count)
print(total)

# One row per SNP (row name = SNP column name) with its mismatch count.
# Despite the file name, the counts are per SNP column.
df <- data.frame(mismatch_count = mismatch_count, row.names = colnames(d1))
write.csv(df, "mismatch_count_per_row.csv", row.names = TRUE)
# Mismatch percentage
# as.numeric() avoids integer overflow when multiplying large dimensions.
A <- as.numeric(nrow(d1)) * length(d1) # number of compared cells

perfect_matches <- A - total
mismatch_percentage <- (total / A) * 100
perfect_match_percentage <- 100 - mismatch_percentage

# Factoring everything in: percentage of overall perfect matches, relative to
# the full size of the old data.
# NOTE(review): the denominator counts every column of `dat`, not only the
# genotype columns -- confirm this is the intended definition.
perfect_match_percentage_all <-
  (perfect_matches / (as.numeric(nrow(dat)) * length(dat))) * 100

# One summary item per line. Counts are formatted explicitly so that large or
# round values are never written in scientific notation.
summary_lines <- c(
  "Percentage of matches (factoring out the extra FIDs in new data):",
  round(perfect_match_percentage_all, 2),
  "Percentage of matches (factoring out multiple factors)*:",
  round(perfect_match_percentage, 2),
  "Number of matches*:",
  format(perfect_matches, scientific = FALSE, trim = TRUE),
  "Number of mismatches*:",
  format(total, scientific = FALSE, trim = TRUE),
  " ",
  "*Note: the calculation factors out the information below:",
  "FIDs removed from old data(ClzTOP-dataTOP.RData) because no matches were found:",
  count_old_FID_remove,
  "FIDs removed from new data(NEW_TOP_DATA.RData) because no matches were found:",
  count_new_FID_remove,
  "SNPs removed old data(ClzTOP-dataTOP.RData) because no matches were found:",
  count_old_SNP_removed,
  "SNPs removed from new data(NEW_TOP_DATA.RData) because of duplication:",
  count_new_SNP_removed,
  " ",
  "Columns listed below were also removed from old data(ClzTOP-dataTOP.RData):",
  count_col_old_removed,
  "Columns listed below were also removed from new data(NEW_TOP_DATA.RData):",
  count_col_new_removed
)
# Given a file name, writeLines() opens and closes the file itself.
writeLines(summary_lines, "summary.txt")