BCH 390P Project - Project Codes in R

Project Codes in R

Length, Overlap, Linkage Equilibrium

library(tidyverse)

# Load necessary library

library(dplyr)

library(tidyr)

# Assuming combined_df is your data frame with the X column

# Extract the first column 'X' and split it into 'chromosome', 'start', and 'end'

combined_df <- combined_df %>%

select(X) %>%

separate(X, into = c("chromosome", "start", "end"), sep = "-")

# Convert start and end positions to numeric

combined_df$start <- as.numeric(combined_df$start)

combined_df$end <- as.numeric(combined_df$end)

# Calculate the base pair length

combined_df <- combined_df %>% mutate(length_bp = end - start)

# View the updated dataframe with the length column

print(combined_df)

# Load necessary library

library(ggplot2)

# Create a histogram of the base pair lengths

ggplot(combined_df, aes(x = length_bp)) +

geom_histogram(binwidth = 50, fill = "blue", color = "black", alpha = 0.7) +

labs(title = "Distribution of Base Pair Lengths",

x = "Base Pair Length",

y = "Count") + # Y-axis as count

theme_minimal()

# Create a histogram of the base pair lengths with custom x-axis increments

ggplot(combined_df, aes(x = length_bp)) +

geom_histogram(binwidth = 50, fill = "blue", color = "black", alpha = 0.7) +

labs(title = "Distribution of Base Pair Lengths",

x = "Base Pair Length",

y = "Count") + # Y-axis as count

scale_x_continuous(breaks = seq(0, max(combined_df$length_bp, na.rm = TRUE), by = 100)) + # Custom increments on x-axis

theme_minimal()

# Create a new column that groups lengths greater than 1500

combined_df <- combined_df %>%

mutate(length_bp_grouped = ifelse(length_bp > 1500, 1500, length_bp))

# Create a histogram using the grouped lengths

ggplot(combined_df, aes(x = length_bp_grouped)) +

geom_histogram(binwidth = 50, fill = "blue", color = "black", alpha = 0.7) +

labs(title = "Distribution of Base Pair Lengths",

x = "Base Pair Length",

y = "Count") +

scale_x_continuous(breaks = seq(0, 1600, by = 100), labels = c(seq(0, 1500, by = 100), "1500+")) + # Group all values > 1500 together

theme_minimal()

# Create a BED format dataframe from the enhancer coordinates in combined_df

bed_df <- combined_df %>%

select(chromosome, start, end) %>%

mutate(chromosome = paste0("", chromosome)) # Ensure the "chr" prefix is included for each chromosome

# Save the BED file (tab-separated format)

write.table(bed_df, "mouse_enhancers.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)

library(GenomicRanges)

library(rtracklayer)

library(dplyr)

gwas_data <- read.delim("gwas-association-Obesity-withChildTraits.tsv", sep = "\t", stringsAsFactors = FALSE)

# Step 2: Clean the GWAS data (extract relevant columns)

# We need the CHR_ID (chromosome), CHR_POS (position), and we will create start and end positions by adding a 50bp window

# Convert CHR_POS to numeric and remove rows with missing CHR_POS

gwas_data <- gwas_data %>%

filter(!is.na(CHR_POS)) %>%

mutate(CHR_POS = as.numeric(CHR_POS),

start = CHR_POS - 50,

end = CHR_POS + 50)

# Step 3: Create a GRanges object for the GWAS loci

gwas_gr <- GRanges(seqnames = gwas_data$CHR_ID,

ranges = IRanges(start = gwas_data$start, end = gwas_data$end))

seqlevels(gwas_gr) <- paste0("chr", seqlevels(gwas_gr))

# Step 4: Load the human enhancers BED file

library(GenomicRanges)

# Load the human enhancer data (BED file) and create a GRanges object

# Load the human enhancer data (BED file) and assign meaningful column names

human_coords_df <- read.delim("hglft_genome_268e8_b45c80.bed", header = FALSE)

# Assign proper column names based on BED format

colnames(human_coords_df) <- c("chrom", "start", "end", "name", "score")

# Convert the human coordinates data frame to GRanges object

human_enhancers <- GRanges(

seqnames = human_coords_df$chrom, # Chromosome

ranges = IRanges(start = human_coords_df$start, end = human_coords_df$end), # Start and end positions

strand = "*" # No strand information in the BED file

)

# Step 5: Find overlaps between enhancers and GWAS loci

overlaps <- findOverlaps(human_enhancers, gwas_gr)

overlaps

# Extract the overlapping regions

overlapping_enhancers <- human_enhancers[queryHits(overlaps)]

overlapping_gwas <- gwas_gr[subjectHits(overlaps)]

# Step 6: Combine results into a data frame for inspection or saving

overlap_df <- data.frame(

enhancer_chr = seqnames(overlapping_enhancers),

enhancer_start = start(overlapping_enhancers),

enhancer_end = end(overlapping_enhancers),

gwas_chr = seqnames(overlapping_gwas),

gwas_start = start(overlapping_gwas),

gwas_end = end(overlapping_gwas)

)

# View the first few rows of the overlaps

head(overlap_df)

# Optionally save the overlapping regions to a file

write.table(overlap_df, "enhancer_gwas_overlaps.txt", sep = "\t", row.names = FALSE, quote = FALSE)

# Step 1: Load the chain file for liftover (human to mouse)

chain_file <- "hg38ToMm10.over.chain"

chain <- import.chain(chain_file)

# Step 2: Create a GRanges object for the human enhancer (chr14: 72225811-72226177)

human_olenhancer1 <- GRanges(seqnames = "chr11",

ranges = IRanges(start = 8640252, end = 8640969))

# Step 3: Perform the liftover from human to mouse

mouse_enhancer <- liftOver(human_olenhancer1, chain)

# Step 4: Convert the result to a data frame to inspect the mouse coordinates

mouse_enhancer_df <- as.data.frame(mouse_enhancer)

# View the converted mouse enhancer coordinates

mouse_enhancer_df

human_olenhancer2 <- GRanges(seqnames = "chr6",

ranges = IRanges(start = 97128328, end = 97128951))

# Step 3: Perform the liftover from human to mouse

mouse_enhancer2 <- liftOver(human_olenhancer2, chain)

# Step 4: Convert the result to a data frame to inspect the mouse coordinates

mouse_enhancer_df2 <- as.data.frame(mouse_enhancer2)

# View the converted mouse enhancer coordinates

mouse_enhancer_df2

# Assuming overlap_df contains the human enhancer overlaps

# Select relevant columns for the BED file (chr, start, end)

overlap_bed <- overlap_df[, c("enhancer_chr", "enhancer_start", "enhancer_end")]

# Rename the columns to match BED format

colnames(overlap_bed) <- c("chr", "start", "end")

# Save the BED file

write.table(overlap_bed, "human_enhancer_overlaps.bed", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

# Load the lifted mouse coordinates from LiftOver

lifted_mouse_coords <- read.table("hglft_genome_Obesity.bed", header = FALSE)

colnames(lifted_mouse_coords) <- c("chrom", "start", "end") # Assign column names

# Assuming combined_df contains the original mouse enhancer data

# Find overlaps between lifted mouse coordinates and the original mouse data

library(GenomicRanges)

# Convert the original mouse data (combined_df) to a GRanges object

original_mouse_gr <- GRanges(

seqnames = combined_df$chrom, # Chromosome column in original data

ranges = IRanges(start = combined_df$start, end = combined_df$end), # Start and end positions

strand = "*" # Strand information

)

# Convert the lifted mouse coordinates to a GRanges object

lifted_mouse_gr <- GRanges(

seqnames = lifted_mouse_coords$chrom, # Chromosome

ranges = IRanges(start = lifted_mouse_coords$start, end = lifted_mouse_coords$end), # Start and end positions

strand = "*"

)

# Find overlaps between the lifted-over mouse regions and the original mouse data

overlaps_back_to_mouse <- findOverlaps(lifted_mouse_gr, original_mouse_gr)

# Extract the overlapping original mouse regions

matched_mouse_enhancers <- original_mouse_gr[subjectHits(overlaps_back_to_mouse)]

# View the matched mouse enhancers

head(matched_mouse_enhancers)

combined_df1 <- bind_rows(df1, df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12,df13,df14,df15,df16,df17,df18,df19,df20,df21,df22,df23,df24,df25,df26,df27,df29,df30,df31,df32,df33,df34,df35,df36,df37)

combined_df1 <- combined_df1 %>%

select(X,source) %>%

separate(X, into = c("chromosome", "start", "end"), sep = "-")

# Convert start and end positions to numeric

combined_df1$start <- as.numeric(combined_df1$start)

combined_df1$end <- as.numeric(combined_df1$end)

combined_df4<-filter(combined_df1,chromosome=="chr4")

combined_df7<-filter(combined_df1,chromosome=="chr7")

# Load the TSV file from the GWAS Catalog

gwas_snp_data <- read.delim("gwas-association-Obesity-withChildTraits.tsv", sep = "\t")

# View the first few rows of the dataset to check the structure

head(gwas_snp_data)

# Extract the column with SNP IDs (e.g., 'SNPS' column)

snps_of_interest <- gwas_snp_data$SNPS

# Remove any missing values (optional but useful)

snps_of_interest <- na.omit(snps_of_interest)

# View the first few SNPs to confirm

head(snps_of_interest)

# Load the LDlinkR package

library(LDlinkR)

# Set your API token directly in the function call (replace with your actual token)

api_token <- "0650be540fe7"

# Perform LD analysis for each SNP in the list using the token

ld_results <- lapply(snps_of_interest, function(snp) {

tryCatch({

# Query LDproxy to get SNPs in LD with the current SNP

ld_data <- LDproxy(snp = snp, pop = "EUR", r2d = "r2", token = api_token)

# Return the LD data

return(ld_data)

}, error = function(e) {

# Handle any errors (e.g., SNP not found or API limits)

message(paste("Error with SNP:", snp, ":", e$message))

return(NULL)

})

# Check the first LD result

ld_results[[1]]

# Save the list to an RData file

save(ld_results, file = "ld_results.RData")

# Load it back in another session

load("ld_results.RData")

# Convert list of results to a data frame (assuming the data in ld_results is compatible)

ld_df <- do.call(rbind, lapply(ld_results, as.data.frame))

# Write the data frame to a CSV file

write.csv(ld_df, "ld_results.csv", row.names = FALSE)

# Inspect the structure of the first result

str(ld_results[[1]])

# View the first few rows of the first result

head(ld_results[[1]])

# Combine all LD results into a single data frame

combined_ld_results <- do.call(rbind, ld_results)

# View the combined data

head(combined_ld_results)

# Filter SNPs with strong LD (e.g., r² > 0.8)

strong_ld_snps <- subset(combined_ld_results, R2 > 0.8)

# View the filtered SNPs

head(strong_ld_snps)

# Check the column names in the strong_ld_snps data frame

colnames(strong_ld_snps)

library(ggplot2)

# Plot the distribution of r² values

ggplot(combined_ld_results, aes(x = R2)) +

geom_histogram(binwidth = 0.05, fill = "blue", color = "black") +

labs(title = "Distribution of r² Values in LD Results",

x = "r²",

y = "Count") +

theme_minimal()

# Save the combined LD results to a CSV file

write.csv(combined_ld_results, "combined_ld_results.csv", row.names = FALSE)

# Save the filtered strong LD SNPs to a CSV file

write.csv(strong_ld_snps, "strong_ld_snps.csv", row.names = FALSE)

library(GenomicRanges)

# Assuming combined_ld_results is your full dataset

# Extract chromosome and start positions from the Coord column

combined_ld_results$chrom <- gsub(":.*", "", combined_ld_results$Coord) # Extract chromosome

combined_ld_results$start <- as.numeric(gsub(".*:", "", combined_ld_results$Coord)) # Extract start position

# For SNPs, start and end positions are typically the same

combined_ld_results$end <- combined_ld_results$start

combined_ld_results

# Check for missing values in the start and end columns

sum(is.na(combined_ld_results$start)) # Count missing start positions

sum(is.na(combined_ld_results$end)) # Count missing end positions

# Remove rows with NA in start or end positions

filtered_ld_results <- combined_ld_results[!is.na(combined_ld_results$start) &

!is.na(combined_ld_results$end), ]

# Create a GRanges object from the filtered dataset

ld_snps_gr <- GRanges(

seqnames = filtered_ld_results$chrom, # Chromosome

ranges = IRanges(start = filtered_ld_results$start, end = filtered_ld_results$end), # Start and end positions

strand = "*", # No strand info for SNPs

RS_Number = filtered_ld_results$RS_Number, # Add SNP IDs

R2 = filtered_ld_results$R2 # Add R2 values

)

# Check the GRanges object

ld_snps_gr

# Perform the overlap between LD SNPs and human enhancers

LDoverlaps <- findOverlaps(ld_snps_gr, human_enhancers)

# Extract the overlapping LD SNPs and enhancers

overlapping_ld_snps <- ld_snps_gr[queryHits(LDoverlaps)]

overlapping_enhancers_LD <- human_enhancers[subjectHits(LDoverlaps)]

# Create a data frame of the overlapping results

LDoverlap_df <- data.frame(

query_snp = overlapping_ld_snps$RS_Number,

enhancer_chr = seqnames(overlapping_enhancers_LD),

enhancer_start = start(overlapping_enhancers_LD),

enhancer_end = end(overlapping_enhancers_LD),

ld_snp_r2 = overlapping_ld_snps$R2 # Include R2 value for the LD SNPs

)

LDoverlap_df

# View the first few rows of the overlap results

head(LDoverlap_df)

# Count the number of LD SNPs per enhancer

ld_snp_counts_per_enhancer <- LDoverlap_df %>%

group_by(enhancer_chr, enhancer_start, enhancer_end) %>%

summarise(n_ld_snps = n())

# View the SNP counts per enhancer

print(ld_snp_counts_per_enhancer,n=500)

# Save the overlap results to a CSV file

write.csv(LDoverlap_df, "ld_human_enhancer_overlaps.csv", row.names = FALSE)

# Save the SNP counts per enhancer

write.csv(ld_snp_counts_per_enhancer, "ld_snp_counts_per_enhancer.csv", row.names = FALSE)

str(ld_results)

MPRA Library Design

# Load necessary libraries

library(tidyverse)

library(GenomicRanges)

library(Biostrings)

# Load combined_file.csv

combined_for_tile <- read.csv("combined_file.csv")

# Remove duplicate entries based on the "X" column

combined_for_tile <- combined_for_tile %>% distinct(X, .keep_all = TRUE)

# Separate genomic coordinates

TileDf <- combined_for_tile %>%

dplyr::select(X) %>%

tidyr::separate(X, into = c("chromosome", "start", "end"), sep = "-")

TileDf$start <- as.numeric(TileDf$start)

TileDf$end <- as.numeric(TileDf$end)

TileDf <- TileDf %>% mutate(length_bp = end - start)

# 2. Split into Large and Small Regions

large_regions <- TileDf %>% filter(length_bp > 270)

small_regions <- TileDf %>% filter(length_bp <= 270)

# 3. Create GRanges for Large Regions

granges_large <- GRanges(

seqnames = large_regions$chromosome,

ranges = IRanges(start = large_regions$start, end = large_regions$end)

)

# 4. Tiling Large Regions (270 bp, 30 bp overlap)

tile_width <- 270

overlap_size <- 30

tiled_granges_list <- list()

for (i in seq_along(granges_large)) {

current_range <- granges_large[i]

tiles <- slidingWindows(current_range, width = tile_width, step = tile_width - overlap_size)

tiled_granges_list[[i]] <- tiles[[1]]

}

tiled_granges <- do.call(c, tiled_granges_list)

# 5. Load Genome

genome <- readDNAStringSet("Mus_musculus.GRCm38.dna.primary_assembly.fa.gz")

# Helper: Extract correct chromosome name from genome names

get_chr_name <- function(chr) {

match_chr <- grep(paste0("^", chr, " "), names(genome), value = TRUE)

if (length(match_chr) == 0) stop(paste("Chromosome", chr, "not found"))

return(match_chr)

}

# Helper: Extract genomic sequence

extract_sequence <- function(chr, start, end) {

chr_name <- get_chr_name(chr)

chr_seq <- genome[[chr_name]]

return(subseq(chr_seq, start = start, end = end))

}

# 6. Extract and Filter Large Tiled Sequences by GC Content

calculate_gc_content <- function(seq) {

gc_count <- sum(letterFrequency(DNAString(seq), letters = c("G", "C")))

return((gc_count / nchar(seq)) * 100)

}

seqlevels(tiled_granges) <- gsub("^chr", "", seqlevels(tiled_granges))

# Extract tiled sequences

tiled_sequences <- sapply(seq_along(tiled_granges), function(i) {

chr <- as.character(seqnames(tiled_granges[i]))

start <- start(tiled_granges[i])

end <- end(tiled_granges[i])

as.character(extract_sequence(chr, start, end))

})

# Filter by GC content <= 60%

tiled_gc <- sapply(tiled_sequences, calculate_gc_content)

filtered_tiled_sequences <- tiled_sequences[tiled_gc <= 60]

filtered_tiled_granges <- tiled_granges[tiled_gc <= 60]

# 7. Extract and Filter Small Sequences by GC Content

small_regions$chromosome <- gsub("^chr", "", small_regions$chromosome)

small_sequences <- sapply(seq_along(small_regions$chromosome), function(i) {

chr <- small_regions$chromosome[i]

start <- small_regions$start[i]

end <- small_regions$end[i]

as.character(extract_sequence(chr, start, end))

})

small_gc <- sapply(small_sequences, calculate_gc_content)

filtered_small_sequences <- small_sequences[small_gc <= 60]

filtered_small_regions <- small_regions[small_gc <= 60, ]

# 8. Add Adaptors

adaptor_left <- "AGGACCGGATCAACT"

adaptor_right <- "CATTGCGTGAACCGA"

# Add adaptors to sequences

final_tiled_seq <- paste0(adaptor_left, filtered_tiled_sequences, adaptor_right)

final_small_seq <- paste0(adaptor_left, filtered_small_sequences, adaptor_right)

# FASTA headers

tiled_headers <- paste0(">", seqnames(filtered_tiled_granges), ":", start(filtered_tiled_granges), "-", end(filtered_tiled_granges))

small_headers <- paste0(">", filtered_small_regions$chromosome, ":", filtered_small_regions$start, "-", filtered_small_regions$end)

# 9. Write FASTA File

final_fasta_lines <- c(

paste0(tiled_headers, "\n", final_tiled_seq),

paste0(small_headers, "\n", final_small_seq)

)

writeLines(final_fasta_lines, "filtered_tiled_and_small_enhancers.fa")

# 10. Count X and Y Chromosome Entries

tiled_chr <- as.character(seqnames(filtered_tiled_granges))

small_chr <- as.character(filtered_small_regions$chromosome)

xy_tiled_count <- sum(tiled_chr %in% c("X", "Y"))

xy_small_count <- sum(small_chr %in% c("X", "Y"))

total_xy_count <- xy_tiled_count + xy_small_count

total_tiled_count <- length(tiled_chr)

total_small_count <- nrow(filtered_small_regions)

total_combined <- total_tiled_count + total_small_count

cat("X/Y large tiled enhancers:", xy_tiled_count, "\n")

cat("X/Y small enhancers:", xy_small_count, "\n")

cat("Total X/Y enhancers:", total_xy_count, "\n")

cat("Total large tiled:", total_tiled_count, "\n")

cat("Total small:", total_small_count, "\n")

cat("Total combined:", total_combined, "\n")

All analyses were performed using R Statistical Software (v4.1.2; R Core Team 2021).

Page updated

Report abuse