library(tidyverse)
# Load necessary library
library(dplyr)
library(tidyr)
# Assuming combined_df is your data frame with the X column
# Extract the first column 'X' and split it into 'chromosome', 'start', and 'end'
combined_df <- combined_df %>%
select(X) %>%
separate(X, into = c("chromosome", "start", "end"), sep = "-")
# Convert start and end positions to numeric
combined_df$start <- as.numeric(combined_df$start)
combined_df$end <- as.numeric(combined_df$end)
# Calculate the base pair length
combined_df <- combined_df %>% mutate(length_bp = end - start)
# View the updated dataframe with the length column
print(combined_df)
# Load necessary library
library(ggplot2)
# Create a histogram of the base pair lengths
ggplot(combined_df, aes(x = length_bp)) +
geom_histogram(binwidth = 50, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Base Pair Lengths",
x = "Base Pair Length",
y = "Count") + # Y-axis as count
theme_minimal()
# Create a histogram of the base pair lengths with custom x-axis increments
ggplot(combined_df, aes(x = length_bp)) +
geom_histogram(binwidth = 50, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Base Pair Lengths",
x = "Base Pair Length",
y = "Count") + # Y-axis as count
scale_x_continuous(breaks = seq(0, max(combined_df$length_bp, na.rm = TRUE), by = 100)) + # Custom increments on x-axis
theme_minimal()
# Create a new column that groups lengths greater than 1500
combined_df <- combined_df %>%
mutate(length_bp_grouped = ifelse(length_bp > 1500, 1500, length_bp))
# Create a histogram using the grouped lengths
ggplot(combined_df, aes(x = length_bp_grouped)) +
geom_histogram(binwidth = 50, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Base Pair Lengths",
x = "Base Pair Length",
y = "Count") +
scale_x_continuous(breaks = seq(0, 1600, by = 100), labels = c(seq(0, 1500, by = 100), "1500+")) + # Group all values > 1500 together
theme_minimal()
# Create a BED format dataframe from the enhancer coordinates in combined_df
bed_df <- combined_df %>%
select(chromosome, start, end) %>%
mutate(chromosome = paste0("", chromosome)) # Ensure the "chr" prefix is included for each chromosome
# Save the BED file (tab-separated format)
write.table(bed_df, "mouse_enhancers.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
library(GenomicRanges)
library(rtracklayer)
library(dplyr)
gwas_data <- read.delim("gwas-association-Obesity-withChildTraits.tsv", sep = "\t", stringsAsFactors = FALSE)
# Step 2: Clean the GWAS data (extract relevant columns)
# We need the CHR_ID (chromosome), CHR_POS (position), and we will create start and end positions by adding a 50bp window
# Convert CHR_POS to numeric and remove rows with missing CHR_POS
gwas_data <- gwas_data %>%
filter(!is.na(CHR_POS)) %>%
mutate(CHR_POS = as.numeric(CHR_POS),
start = CHR_POS - 50,
end = CHR_POS + 50)
# Step 3: Create a GRanges object for the GWAS loci
gwas_gr <- GRanges(seqnames = gwas_data$CHR_ID,
ranges = IRanges(start = gwas_data$start, end = gwas_data$end))
seqlevels(gwas_gr) <- paste0("chr", seqlevels(gwas_gr))
# Step 4: Load the human enhancers BED file
library(GenomicRanges)
# Load the human enhancer data (BED file) and create a GRanges object
# Load the human enhancer data (BED file) and assign meaningful column names
human_coords_df <- read.delim("hglft_genome_268e8_b45c80.bed", header = FALSE)
# Assign proper column names based on BED format
colnames(human_coords_df) <- c("chrom", "start", "end", "name", "score")
# Convert the human coordinates data frame to GRanges object
human_enhancers <- GRanges(
seqnames = human_coords_df$chrom, # Chromosome
ranges = IRanges(start = human_coords_df$start, end = human_coords_df$end), # Start and end positions
strand = "*" # No strand information in the BED file
)
# Step 5: Find overlaps between enhancers and GWAS loci
overlaps <- findOverlaps(human_enhancers, gwas_gr)
overlaps
# Extract the overlapping regions
overlapping_enhancers <- human_enhancers[queryHits(overlaps)]
overlapping_gwas <- gwas_gr[subjectHits(overlaps)]
# Step 6: Combine results into a data frame for inspection or saving
overlap_df <- data.frame(
enhancer_chr = seqnames(overlapping_enhancers),
enhancer_start = start(overlapping_enhancers),
enhancer_end = end(overlapping_enhancers),
gwas_chr = seqnames(overlapping_gwas),
gwas_start = start(overlapping_gwas),
gwas_end = end(overlapping_gwas)
)
# View the first few rows of the overlaps
head(overlap_df)
# Optionally save the overlapping regions to a file
write.table(overlap_df, "enhancer_gwas_overlaps.txt", sep = "\t", row.names = FALSE, quote = FALSE)
# Step 1: Load the chain file for liftover (human to mouse)
chain_file <- "hg38ToMm10.over.chain"
chain <- import.chain(chain_file)
# Step 2: Create a GRanges object for the human enhancer (chr14: 72225811-72226177)
human_olenhancer1 <- GRanges(seqnames = "chr11",
ranges = IRanges(start = 8640252, end = 8640969))
# Step 3: Perform the liftover from human to mouse
mouse_enhancer <- liftOver(human_olenhancer1, chain)
# Step 4: Convert the result to a data frame to inspect the mouse coordinates
mouse_enhancer_df <- as.data.frame(mouse_enhancer)
# View the converted mouse enhancer coordinates
mouse_enhancer_df
human_olenhancer2 <- GRanges(seqnames = "chr6",
ranges = IRanges(start = 97128328, end = 97128951))
# Step 3: Perform the liftover from human to mouse
mouse_enhancer2 <- liftOver(human_olenhancer2, chain)
# Step 4: Convert the result to a data frame to inspect the mouse coordinates
mouse_enhancer_df2 <- as.data.frame(mouse_enhancer2)
# View the converted mouse enhancer coordinates
mouse_enhancer_df2
# Assuming overlap_df contains the human enhancer overlaps
# Select relevant columns for the BED file (chr, start, end)
overlap_bed <- overlap_df[, c("enhancer_chr", "enhancer_start", "enhancer_end")]
# Rename the columns to match BED format
colnames(overlap_bed) <- c("chr", "start", "end")
# Save the BED file
write.table(overlap_bed, "human_enhancer_overlaps.bed", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
# Load the lifted mouse coordinates from LiftOver
lifted_mouse_coords <- read.table("hglft_genome_Obesity.bed", header = FALSE)
colnames(lifted_mouse_coords) <- c("chrom", "start", "end") # Assign column names
# Assuming combined_df contains the original mouse enhancer data
# Find overlaps between lifted mouse coordinates and the original mouse data
library(GenomicRanges)
# Convert the original mouse data (combined_df) to a GRanges object
original_mouse_gr <- GRanges(
seqnames = combined_df$chrom, # Chromosome column in original data
ranges = IRanges(start = combined_df$start, end = combined_df$end), # Start and end positions
strand = "*" # Strand information
)
# Convert the lifted mouse coordinates to a GRanges object
lifted_mouse_gr <- GRanges(
seqnames = lifted_mouse_coords$chrom, # Chromosome
ranges = IRanges(start = lifted_mouse_coords$start, end = lifted_mouse_coords$end), # Start and end positions
strand = "*"
)
# Find overlaps between the lifted-over mouse regions and the original mouse data
overlaps_back_to_mouse <- findOverlaps(lifted_mouse_gr, original_mouse_gr)
# Extract the overlapping original mouse regions
matched_mouse_enhancers <- original_mouse_gr[subjectHits(overlaps_back_to_mouse)]
# View the matched mouse enhancers
head(matched_mouse_enhancers)
combined_df1 <- bind_rows(df1, df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12,df13,df14,df15,df16,df17,df18,df19,df20,df21,df22,df23,df24,df25,df26,df27,df29,df30,df31,df32,df33,df34,df35,df36,df37)
combined_df1 <- combined_df1 %>%
select(X,source) %>%
separate(X, into = c("chromosome", "start", "end"), sep = "-")
# Convert start and end positions to numeric
combined_df1$start <- as.numeric(combined_df1$start)
combined_df1$end <- as.numeric(combined_df1$end)
combined_df4<-filter(combined_df1,chromosome=="chr4")
combined_df7<-filter(combined_df1,chromosome=="chr7")
# Load the TSV file from the GWAS Catalog
gwas_snp_data <- read.delim("gwas-association-Obesity-withChildTraits.tsv", sep = "\t")
# View the first few rows of the dataset to check the structure
head(gwas_snp_data)
# Extract the column with SNP IDs (e.g., 'SNPS' column)
snps_of_interest <- gwas_snp_data$SNPS
# Remove any missing values (optional but useful)
snps_of_interest <- na.omit(snps_of_interest)
# View the first few SNPs to confirm
head(snps_of_interest)
# Load the LDlinkR package
library(LDlinkR)
# Set your API token directly in the function call (replace with your actual token)
api_token <- "0650be540fe7"
# Perform LD analysis for each SNP in the list using the token
ld_results <- lapply(snps_of_interest, function(snp) {
tryCatch({
# Query LDproxy to get SNPs in LD with the current SNP
ld_data <- LDproxy(snp = snp, pop = "EUR", r2d = "r2", token = api_token)
# Return the LD data
return(ld_data)
}, error = function(e) {
# Handle any errors (e.g., SNP not found or API limits)
message(paste("Error with SNP:", snp, ":", e$message))
return(NULL)
})
})
# Check the first LD result
ld_results[[1]]
# Save the list to an RData file
save(ld_results, file = "ld_results.RData")
# Load it back in another session
load("ld_results.RData")
# Convert list of results to a data frame (assuming the data in ld_results is compatible)
ld_df <- do.call(rbind, lapply(ld_results, as.data.frame))
# Write the data frame to a CSV file
write.csv(ld_df, "ld_results.csv", row.names = FALSE)
# Inspect the structure of the first result
str(ld_results[[1]])
# View the first few rows of the first result
head(ld_results[[1]])
# Combine all LD results into a single data frame
combined_ld_results <- do.call(rbind, ld_results)
# View the combined data
head(combined_ld_results)
# Filter SNPs with strong LD (e.g., r² > 0.8)
strong_ld_snps <- subset(combined_ld_results, R2 > 0.8)
# View the filtered SNPs
head(strong_ld_snps)
# Check the column names in the strong_ld_snps data frame
colnames(strong_ld_snps)
library(ggplot2)
# Plot the distribution of r² values
ggplot(combined_ld_results, aes(x = R2)) +
geom_histogram(binwidth = 0.05, fill = "blue", color = "black") +
labs(title = "Distribution of r² Values in LD Results",
x = "r²",
y = "Count") +
theme_minimal()
# Save the combined LD results to a CSV file
write.csv(combined_ld_results, "combined_ld_results.csv", row.names = FALSE)
# Save the filtered strong LD SNPs to a CSV file
write.csv(strong_ld_snps, "strong_ld_snps.csv", row.names = FALSE)
library(GenomicRanges)
# Assuming combined_ld_results is your full dataset
# Extract chromosome and start positions from the Coord column
combined_ld_results$chrom <- gsub(":.*", "", combined_ld_results$Coord) # Extract chromosome
combined_ld_results$start <- as.numeric(gsub(".*:", "", combined_ld_results$Coord)) # Extract start position
# For SNPs, start and end positions are typically the same
combined_ld_results$end <- combined_ld_results$start
combined_ld_results
# Check for missing values in the start and end columns
sum(is.na(combined_ld_results$start)) # Count missing start positions
sum(is.na(combined_ld_results$end)) # Count missing end positions
# Remove rows with NA in start or end positions
filtered_ld_results <- combined_ld_results[!is.na(combined_ld_results$start) &
!is.na(combined_ld_results$end), ]
# Create a GRanges object from the filtered dataset
ld_snps_gr <- GRanges(
seqnames = filtered_ld_results$chrom, # Chromosome
ranges = IRanges(start = filtered_ld_results$start, end = filtered_ld_results$end), # Start and end positions
strand = "*", # No strand info for SNPs
RS_Number = filtered_ld_results$RS_Number, # Add SNP IDs
R2 = filtered_ld_results$R2 # Add R2 values
)
# Check the GRanges object
ld_snps_gr
# Perform the overlap between LD SNPs and human enhancers
LDoverlaps <- findOverlaps(ld_snps_gr, human_enhancers)
# Extract the overlapping LD SNPs and enhancers
overlapping_ld_snps <- ld_snps_gr[queryHits(LDoverlaps)]
overlapping_enhancers_LD <- human_enhancers[subjectHits(LDoverlaps)]
# Create a data frame of the overlapping results
LDoverlap_df <- data.frame(
query_snp = overlapping_ld_snps$RS_Number,
enhancer_chr = seqnames(overlapping_enhancers_LD),
enhancer_start = start(overlapping_enhancers_LD),
enhancer_end = end(overlapping_enhancers_LD),
ld_snp_r2 = overlapping_ld_snps$R2 # Include R2 value for the LD SNPs
)
LDoverlap_df
# View the first few rows of the overlap results
head(LDoverlap_df)
# Count the number of LD SNPs per enhancer
ld_snp_counts_per_enhancer <- LDoverlap_df %>%
group_by(enhancer_chr, enhancer_start, enhancer_end) %>%
summarise(n_ld_snps = n())
# View the SNP counts per enhancer
print(ld_snp_counts_per_enhancer,n=500)
# Save the overlap results to a CSV file
write.csv(LDoverlap_df, "ld_human_enhancer_overlaps.csv", row.names = FALSE)
# Save the SNP counts per enhancer
write.csv(ld_snp_counts_per_enhancer, "ld_snp_counts_per_enhancer.csv", row.names = FALSE)
str(ld_results)
# Load necessary libraries
library(tidyverse)
library(GenomicRanges)
library(Biostrings)
# Load combined_file.csv
combined_for_tile <- read.csv("combined_file.csv")
# Remove duplicate entries based on the "X" column
combined_for_tile <- combined_for_tile %>% distinct(X, .keep_all = TRUE)
# Separate genomic coordinates
TileDf <- combined_for_tile %>%
dplyr::select(X) %>%
tidyr::separate(X, into = c("chromosome", "start", "end"), sep = "-")
TileDf$start <- as.numeric(TileDf$start)
TileDf$end <- as.numeric(TileDf$end)
TileDf <- TileDf %>% mutate(length_bp = end - start)
# 2. Split into Large and Small Regions
large_regions <- TileDf %>% filter(length_bp > 270)
small_regions <- TileDf %>% filter(length_bp <= 270)
# 3. Create GRanges for Large Regions
granges_large <- GRanges(
seqnames = large_regions$chromosome,
ranges = IRanges(start = large_regions$start, end = large_regions$end)
)
# 4. Tiling Large Regions (270 bp, 30 bp overlap)
tile_width <- 270
overlap_size <- 30
tiled_granges_list <- list()
for (i in seq_along(granges_large)) {
current_range <- granges_large[i]
tiles <- slidingWindows(current_range, width = tile_width, step = tile_width - overlap_size)
tiled_granges_list[[i]] <- tiles[[1]]
}
tiled_granges <- do.call(c, tiled_granges_list)
# 5. Load Genome
genome <- readDNAStringSet("Mus_musculus.GRCm38.dna.primary_assembly.fa.gz")
# Helper: Extract correct chromosome name from genome names
get_chr_name <- function(chr) {
match_chr <- grep(paste0("^", chr, " "), names(genome), value = TRUE)
if (length(match_chr) == 0) stop(paste("Chromosome", chr, "not found"))
return(match_chr)
}
# Helper: Extract genomic sequence
extract_sequence <- function(chr, start, end) {
chr_name <- get_chr_name(chr)
chr_seq <- genome[[chr_name]]
return(subseq(chr_seq, start = start, end = end))
}
# 6. Extract and Filter Large Tiled Sequences by GC Content
calculate_gc_content <- function(seq) {
gc_count <- sum(letterFrequency(DNAString(seq), letters = c("G", "C")))
return((gc_count / nchar(seq)) * 100)
}
seqlevels(tiled_granges) <- gsub("^chr", "", seqlevels(tiled_granges))
# Extract tiled sequences
tiled_sequences <- sapply(seq_along(tiled_granges), function(i) {
chr <- as.character(seqnames(tiled_granges[i]))
start <- start(tiled_granges[i])
end <- end(tiled_granges[i])
as.character(extract_sequence(chr, start, end))
})
# Filter by GC content <= 60%
tiled_gc <- sapply(tiled_sequences, calculate_gc_content)
filtered_tiled_sequences <- tiled_sequences[tiled_gc <= 60]
filtered_tiled_granges <- tiled_granges[tiled_gc <= 60]
# 7. Extract and Filter Small Sequences by GC Content
small_regions$chromosome <- gsub("^chr", "", small_regions$chromosome)
small_sequences <- sapply(seq_along(small_regions$chromosome), function(i) {
chr <- small_regions$chromosome[i]
start <- small_regions$start[i]
end <- small_regions$end[i]
as.character(extract_sequence(chr, start, end))
})
small_gc <- sapply(small_sequences, calculate_gc_content)
filtered_small_sequences <- small_sequences[small_gc <= 60]
filtered_small_regions <- small_regions[small_gc <= 60, ]
# 8. Add Adaptors
adaptor_left <- "AGGACCGGATCAACT"
adaptor_right <- "CATTGCGTGAACCGA"
# Add adaptors to sequences
final_tiled_seq <- paste0(adaptor_left, filtered_tiled_sequences, adaptor_right)
final_small_seq <- paste0(adaptor_left, filtered_small_sequences, adaptor_right)
# FASTA headers
tiled_headers <- paste0(">", seqnames(filtered_tiled_granges), ":", start(filtered_tiled_granges), "-", end(filtered_tiled_granges))
small_headers <- paste0(">", filtered_small_regions$chromosome, ":", filtered_small_regions$start, "-", filtered_small_regions$end)
# 9. Write FASTA File
final_fasta_lines <- c(
paste0(tiled_headers, "\n", final_tiled_seq),
paste0(small_headers, "\n", final_small_seq)
)
writeLines(final_fasta_lines, "filtered_tiled_and_small_enhancers.fa")
# 10. Count X and Y Chromosome Entries
tiled_chr <- as.character(seqnames(filtered_tiled_granges))
small_chr <- as.character(filtered_small_regions$chromosome)
xy_tiled_count <- sum(tiled_chr %in% c("X", "Y"))
xy_small_count <- sum(small_chr %in% c("X", "Y"))
total_xy_count <- xy_tiled_count + xy_small_count
total_tiled_count <- length(tiled_chr)
total_small_count <- nrow(filtered_small_regions)
total_combined <- total_tiled_count + total_small_count
cat("X/Y large tiled enhancers:", xy_tiled_count, "\n")
cat("X/Y small enhancers:", xy_small_count, "\n")
cat("Total X/Y enhancers:", total_xy_count, "\n")
cat("Total large tiled:", total_tiled_count, "\n")
cat("Total small:", total_small_count, "\n")
cat("Total combined:", total_combined, "\n")
All analyses were performed using R Statistical Software (v4.1.2; R Core Team 2021).