Merge SNPmarker

Slack CH April 7th 2021

Location on server: /space/chen-syn01/1/data/cinliu/data/extract_clean_SNPID

Location on server: /space/chen-syn01/1/data/cinliu/data/extract_clean_SNPID

Original instructions

Hi Nini, could you help merge all the files in this folder and give me a file with the second column only (SNP list)?

"/space/chen-syn01/1/data/haowang/InvHap/SNPmarker"

Files given

18 .txt files

Each file looks similar to this:

Draft 1:

SNPmarker.R

# Merge the second column (the SNP list) of every file in the SNPmarker
# folder into a single one-column table.

# Local copy of the input data.
# scp from "/space/chen-syn01/1/data/haowang/InvHap/SNPmarker"
base_dir <- "/Users/nini/Desktop/2021lab/CH/SNPmarker/"
snp_dir <- paste0(base_dir, "SNPmarker/")

files <- list.files(path = snp_dir)
if (length(files) == 0) {
  stop("No input files found in ", snp_dir, call. = FALSE)
}

# Read every file and keep only column 2. Iterating over `files` directly
# also handles a folder with a single file, where `2:length(files)` would
# count backwards (2, 1) and try to read a file that does not exist.
snp_columns <- lapply(files, function(file_name) {
  tab <- read.delim(paste0(snp_dir, file_name))
  message(file_name, ": ", nrow(tab), " rows")
  tab[, 2]
})

df <- data.frame(SNP = unlist(snp_columns, use.names = FALSE))

# The output goes next to the input folder; the path is spelled out so the
# script does not have to change the working directory.
write.table(df, paste0(base_dir, "SNPmarker_2ndColumnOnly.txt"),
            quote = FALSE, row.names = FALSE, col.names = TRUE)

File attached: SNPmarker_2ndColumnOnly.txt

Note: for 2 of the files the names are given in dbSNP instead of rsSNP format. Let me know if you’d like me to do something about it.

Draft 2 (ie final copy):

Changes needed to be made:

In files “UKB_17q21.31.txt” and “UKB_6p21.33.txt”, the names are given in dbSNP instead of rsSNP format. We need to do ID conversion.
- The reference genome version is GRCh37 (also known as hg19)
- Possible tools CH suggested might be helpful:

- https://www.biostars.org/p/171557/

- https://www.biostars.org/p/160302/

- https://support.bioconductor.org/p/49400/

Make ID the common identifier. Then use the merge function to merge and indicate ID as the common factor.

2.SNPmarker_SNPname_only.R

# Merge the second column (the SNP list) of every file in the SNPmarker
# folder into a single one-column table. Run this after the two UKB files
# have had their SNP names converted, so the merged list uses one ID format.

# Local copy of the input data.
# scp from "/space/chen-syn01/1/data/haowang/InvHap/SNPmarker"
base_dir <- "/Users/nini/Desktop/2021lab/CH/SNPmarker/"
snp_dir <- paste0(base_dir, "SNPmarker/")

files <- list.files(path = snp_dir)
if (length(files) == 0) {
  stop("No input files found in ", snp_dir, call. = FALSE)
}

# Read every file and keep only column 2. Iterating over `files` directly
# also handles a folder with a single file, where `2:length(files)` would
# count backwards (2, 1) and try to read a file that does not exist.
snp_columns <- lapply(files, function(file_name) {
  tab <- read.delim(paste0(snp_dir, file_name))
  message(file_name, ": ", nrow(tab), " rows")
  tab[, 2]
})

df <- data.frame(SNP = unlist(snp_columns, use.names = FALSE))

# The output goes next to the input folder; the path is spelled out so the
# script does not have to change the working directory.
write.table(df, paste0(base_dir, "SNPmarker_2ndColumnOnly_update.txt"),
            quote = FALSE, row.names = FALSE, col.names = TRUE)

rsSNP_to_dbSNP.R (this is like step 3)

# Build a list of position-based SNP names for every file in the SNPmarker
# folder by pasting column 1 and column 3 together as "<col1>:<col3>".
# NOTE(review): columns 1 and 3 are presumably chromosome and base-pair
# position -- confirm against the input files.

# Local copy of the input data.
# scp from "/space/chen-syn01/1/data/haowang/InvHap/SNPmarker"
base_dir <- "/Users/nini/Desktop/2021lab/CH/SNPmarker/"
snp_dir <- paste0(base_dir, "SNPmarker/")

# The original notes single out files[6] & files[12] in this listing.
files <- list.files(path = snp_dir)
if (length(files) == 0) {
  stop("No input files found in ", snp_dir, call. = FALSE)
}

# Iterating over `files` directly also handles a folder with a single file,
# where `2:length(files)` would count backwards and read a missing file.
snp_names <- lapply(files, function(file_name) {
  tab <- read.delim(paste0(snp_dir, file_name))
  message(file_name, ": ", nrow(tab), " rows")
  paste0(tab[, 1], ":", tab[, 3])
})

df <- data.frame(SNP = unlist(snp_names, use.names = FALSE))

# The path is spelled out so the script does not have to change the working
# directory.
write.table(df, paste0(base_dir, "dbSNPlist.txt"),
            quote = FALSE, row.names = FALSE, col.names = TRUE)

Full code:

1.snp_name_correction.R

# Convert the SNP names of UKB_6p21.33.txt and UKB_17q21.31.txt from dbSNP
# to rsSNP (the naming used on this page: "dbSNP" is the position-based
# name, "rsSNP" is the rs ID).
#
# The two lookup tables (chr6.txt and chr17.txt) were downloaded from:
# https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=1085830827_zqGJ4lo10PrgO0vu1FtiMHC7v3km&clade=mammal&org=Human&db=hg19&hgta_group=varRep&hgta_track=snp138&hgta_table=0&hgta_regionType=range&position=chr6%3A29%2C600%2C000-29%2C800%2C000&hgta_outputType=selectedFields&hgta_outFileName=6SNPs_151.txt
# Instructions followed from: https://www.biostars.org/p/171557/

base_dir <- "/Users/nini/Desktop/2021lab/CH/SNPmarker"

# Convert the SNP column of one UKB file and overwrite that file in place.
#
# chrom           Chromosome label prepended to chromEnd, e.g. "6".
# ucsc_file       Path to the UCSC Table Browser export for that chromosome.
# ukb_file        Path to the UKB file to convert. It is OVERWRITTEN.
# conversion_file Path the dbSNP -> rsSNP conversion list is written to.
#
# Returns the converted UKB table invisibly.
convert_snp_names <- function(chrom, ucsc_file, ukb_file, conversion_file) {
  # Build the lookup table: "<chrom>:<chromEnd>" -> rs ID.
  # NOTE(review): assumes columns 2 and 3 of the UCSC export are chromEnd
  # and the rs name, in that order -- confirm against chr6.txt / chr17.txt.
  lookup <- read.delim(ucsc_file)[, 2:3]
  lookup$chromEnd <- paste0(chrom, ":", lookup$chromEnd)
  colnames(lookup) <- c("dbSNP", "rsSNP")

  ukb <- read.delim(ukb_file)
  # Drop everything from the first underscore on, so that only the part
  # comparable to the lookup key remains.
  ukb$SNP <- gsub("_.*", "", ukb$SNP)
  idx <- match(ukb$SNP, lookup$dbSNP)

  # If nothing matches, the file has most likely been converted already
  # (its SNP column then holds rs IDs). Overwriting it now would replace
  # every name with NA, so stop before touching anything.
  if (length(idx) > 0 && all(is.na(idx))) {
    stop("No SNP in ", ukb_file, " matches the lookup table; ",
         "refusing to overwrite the file (was it already converted?)",
         call. = FALSE)
  }
  # Unmatched names still become NA in the output; make that visible.
  if (anyNA(idx)) {
    warning(sum(is.na(idx)), " of ", length(idx), " SNPs in ", ukb_file,
            " have no rsSNP match and are written as NA", call. = FALSE)
  }

  write.table(lookup[idx, ], conversion_file,
              quote = FALSE, row.names = FALSE, sep = "\t")

  ukb$SNP <- lookup$rsSNP[idx]
  # Warning: the original file is overwritten on the next line.
  write.table(ukb, ukb_file, quote = FALSE, row.names = FALSE, sep = "\t")
  invisible(ukb)
}

# ch6
convert_snp_names(
  chrom = "6",
  ucsc_file = file.path(base_dir, "chr6.txt"),
  ukb_file = file.path(base_dir, "SNPmarker", "UKB_6p21.33.txt"),
  conversion_file = file.path(base_dir, "chrom6_conversion_list.txt")
)

# ch17
convert_snp_names(
  chrom = "17",
  ucsc_file = file.path(base_dir, "chr17.txt"),
  ukb_file = file.path(base_dir, "SNPmarker", "UKB_17q21.31.txt"),
  conversion_file = file.path(base_dir, "chrom17_conversion_list.txt")
)

For a more cleaned up version: /space/chen-syn01/1/data/cinliu/data/extract_clean_SNPID/SNPmarker

It is basically doing the same thing just with simplified steps.

Page updated

Report abuse