We selected 13 of the 300+ clone-curated sequencing datasets available on the NCBI Sequence Read Archive (SRA) from the Long-Term Evolution Experiment with E. coli (LTEE: https://the-ltee.org/) and analyzed them with breseq. The breseq output is then corrected for GC and origin-of-replication bias, and the bias-corrected read counts across the genome are passed to a Hidden Markov Model (HMM) based copy number prediction model.
Figure 5: Workflow pipeline for breseq analysis, bias correction, and HMM generation
Data Pre-Processing
breseq outputs .bam files with reads aligned to a reference genome. Coverage is then calculated at each base and written to a .tab file. We pre-process this data for downstream operations: window length (1000 bp), window slide length (500 bp), a summary metric per window, ori/ter coordinates, and GC% calculation.
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm  # used for the LOWESS fits in the bias-correction steps
from scipy import ndimage
import seaborn as sns
def preprocess(filepath: str):
    df = pd.read_csv(filepath, delimiter='\t', header=0, index_col=0)
    # Combine strand-specific coverage and drop positions with redundant (multi-mapped) reads
    df["unique_cov"] = df["unique_top_cov"] + df["unique_bot_cov"]
    df["redundant"] = df['redundant_top_cov'] + df['redundant_bot_cov']
    df = df[df['redundant'] == 0]
    genome = df["ref_base"].to_numpy()
    genome_len = len(df)
    # Total GC content of the genome
    n_g = np.count_nonzero(genome == 'G')
    n_c = np.count_nonzero(genome == 'C')
    t_gcp = ((n_g + n_c) / genome_len) * 100
    # Origin and terminus of replication coordinates from the .gbk reference of the E. coli ancestor
    ori = 3886082
    ter = 1567362
    # length = 4629812
    seq = []
    gcp_s = []
    windo = []
    windo_med_cov = []
    win_len = 1000
    win_shift = 500  # 500 bp slide (half-overlapping windows), matching the HMM step below
    i = 0
    # Inspect the genome through a sliding window of win_len bp, stepping win_shift bp at a time
    while i <= (genome_len - win_len):
        winseq = genome[i:(i + win_len)]
        seq.append(''.join(str(element) for element in winseq))
        gcc = np.count_nonzero(winseq == 'G') + np.count_nonzero(winseq == 'C')
        gcp_s.append((gcc / win_len) * 100)
        # Median of the combined unique coverage per window
        windo_med_cov.append(float(df["unique_cov"].iloc[i:(i + win_len)].median()))
        windo.append(i + win_len)
        i = i + win_shift
    # Collect the window coordinates, per-window medians, sequences, and GC%,
    # then normalize coverage and GC% by their genome-wide medians
    gc_df = pd.DataFrame({"window": windo, "sequence": seq,
                          "win_median": windo_med_cov, "GC%": gcp_s})
    gc_df["norm_cov"] = gc_df["win_median"] / gc_df["win_median"].median()
    gc_df["norm_gc"] = gc_df["GC%"] / gc_df["GC%"].median()
    # print(f'Genome length : {genome_len}')
    # print(f'Genome GC% : {t_gcp}')
    return (gc_df, ori, ter)
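A minimal usage sketch (the path below is a hypothetical placeholder for a breseq per-base coverage .tab file):

# Hypothetical input path; preprocess returns the per-window table plus ori/ter coordinates
gc_df, ori, ter = preprocess("/data/ltee/sample01/coverage.tab")
print(gc_df[["window", "win_median", "GC%", "norm_cov", "norm_gc"]].head())
print(f"ori = {ori}, ter = {ter}")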
GC Bias Correction and Plotting
This step takes the pre-processed data, estimates the GC-bias trend with LOWESS regression, and outputs GC-bias-corrected read counts for each window along with their coverage plots.
def find_nearest(array, value):
    # Return the index of the array element closest to `value`
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx

def gc_normalization(filepath):
    df, ori, ter = preprocess(filepath)
    windos = df["window"]
    # Window indices closest to the terminus (x1) and origin (x2) coordinates
    x1, x2 = find_nearest(windos, ter), find_nearest(windos, ori)
    cov = df["norm_cov"]
    gc = df["norm_gc"]
    # Fit the coverage-vs-GC trend with LOWESS and divide it out
    lowess1 = sm.nonparametric.lowess
    gc_out = lowess1(cov, gc, frac=0.8, it=3, delta=0.0, is_sorted=False,
                     missing='none', return_sorted=False)
    df["gc_cor"] = cov / gc_out
    # Rescale the corrected ratios back to read-count space using the genome-wide median
    med = df["win_median"].median()
    cor_rc = [int(df["gc_cor"].iloc[i] * med) for i in range(len(df))]
    df["gc_cor_rc"] = cor_rc
    return (df, x1, x2)
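Since the correction divides each window's normalized coverage by the fitted LOWESS trend, the corrected ratios should center near 1 outside amplified regions. A quick check (hypothetical path):

df, x1, x2 = gc_normalization("/data/ltee/sample01/coverage.tab")
print(round(df["gc_cor"].median(), 3))               # expected to be close to 1 after correction
print(df["window"].iloc[x1], df["window"].iloc[x2])  # windows nearest the terminus and origin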
def plot_gc_cor(filepath, samplename):
    df, x1, x2 = gc_normalization(filepath)
    dir_split = filepath.strip().split('/')[:-1]
    saveplt = str('/'.join(dir_split)) + "/Bias_correction_output/bias_cor_out/"
    os.makedirs(saveplt, exist_ok=True)  # make sure the output directory exists
    ter = df["window"].iloc[x1]
    ori = df["window"].iloc[x2]
    plt.figure(figsize=(10, 8))
    plt.scatter(df['window'], df['norm_gc'], color='brown', label='GC%_normalized_reads', s=3)
    plt.scatter(df["window"], df["norm_cov"], color="gray", label='Raw reads', s=10, alpha=0.7)
    plt.scatter(df['window'], df['gc_cor'], color='black', label='GC-corrected', s=5)
    plt.axvline(x=ter, color='r', linestyle=':', label=f'Terminus: {ter}')
    plt.axvline(x=ori, color='r', linestyle=':', label=f'Origin: {ori}')
    # Adding labels and title
    plt.xlabel('Window (Genomic Position)')
    plt.ylabel('Normalized reads / GC%')
    plt.title(f'{samplename}_GC bias correction')
    plt.legend()
    plt_full_path = os.path.join(saveplt, '%s_GC_corr.png' % samplename.replace(' ', '_'))
    plt.savefig(plt_full_path, format='png', bbox_inches='tight')
    plt.close()
def gc_cor_graphs(filepath, samplename):
    df, x1, x2 = gc_normalization(filepath)
    dir_split = filepath.strip().split('/')[:-1]
    saveplt = str('/'.join(dir_split)) + "/Bias_correction_output/GC_cor_plots/"
    os.makedirs(saveplt, exist_ok=True)  # make sure the output directory exists
    plt.figure(figsize=(10, 8))
    plt.scatter(df['norm_gc'], df['norm_cov'], color='lightblue', label='Raw normalized reads vs GC', s=5)
    plt.scatter(df["norm_gc"], df["gc_cor"], color="darkblue", label='Corrected normalized reads', s=10, alpha=0.7)
    # Adding labels and title
    plt.ylabel('Normalized Reads')
    plt.xlabel('Genome normalized GC%')
    plt.title(f'{samplename}_GCvsNormalizedReads')
    plt.legend()
    plt_full_path = os.path.join(saveplt, '%s_GC_vs_NormRds.png' % samplename.replace(' ', '_'))
    plt.savefig(plt_full_path, format='png', bbox_inches='tight')
    plt.close()
Origin of Replication Bias Correction and Plotting
This step takes the GC-bias-corrected data and the ori/ter coordinates, fits the replication-associated bias trend within those coordinates using LOWESS regression, and outputs ori/ter-bias-corrected read counts for each window along with their coverage plots.
def ori_lowess(df):
    # Fit a LOWESS trend of GC-corrected coverage along the genome and divide it out
    x = df["window"]
    y = df["gc_cor"]
    lowess = sm.nonparametric.lowess
    yout = lowess(y, x, frac=0.8, it=3, delta=0.0,
                  is_sorted=False, missing='none', return_sorted=False)
    ycor = list(y / yout)
    return ycor
def otr_correction(filepath):
    df, x1, x2 = gc_normalization(filepath)
    # Correct each segment between ori and ter separately; only fit a flanking
    # segment when it spans more than ~10% of the windows, otherwise leave it GC-corrected only
    if df.iloc[x1].name > int(len(df) * 0.1):
        seg1 = ori_lowess(df.iloc[:x1])
    else:
        seg1 = list(df["gc_cor"].iloc[:x1])
    if df.iloc[x2].name < int(len(df) * 0.9):
        seg3 = ori_lowess(df.iloc[x2:])
    else:
        seg3 = list(df['gc_cor'].iloc[x2:])
    seg2 = ori_lowess(df.iloc[x1:x2])
    corr = seg1 + seg2 + seg3
    df["otr_cor"] = corr
    return (df, x1, x2)
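A quick sketch of the segment layout (hypothetical path): the correction is applied separately to the windows before the terminus, between terminus and origin, and after the origin.

df, x1, x2 = otr_correction("/data/ltee/sample01/coverage.tab")
print(f"segments: 0-{x1}, {x1}-{x2}, {x2}-{len(df)}")  # three independently corrected spans
print(round(df["otr_cor"].median(), 3))                # should again center near 1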
def plot_otr_corr(filepath, samplename):
    dir_split = filepath.strip().split('/')[:-1]
    saveplt = str('/'.join(dir_split)) + "/Bias_correction_output/bias_cor_out/"
    saveloc = str('/'.join(dir_split)) + "/Bias_correction_output/"
    os.makedirs(saveplt, exist_ok=True)  # make sure the output directories exist
    df, x1, x2 = otr_correction(filepath)
    ter = df["window"].iloc[x1]
    ori = df["window"].iloc[x2]
    # Rescale corrected ratios back to read-count space; replace zero counts with one
    # before handing the data to the HMM stage
    med = df["win_median"].median()
    cor_rc = [int(df["otr_cor"].iloc[i] * med) for i in range(len(df))]
    df["corrected_rc"] = cor_rc
    df['corrected_rc'] = df['corrected_rc'].replace(0, 1)
    df["mean"] = np.mean(cor_rc)
    df["var"] = np.var(cor_rc)
    plt.figure(figsize=(10, 8))
    plt.scatter(df["window"], df["norm_cov"], color="gray", label="Raw reads", s=8)
    plt.scatter(df["window"], df["gc_cor"], color="brown", label="GC corrected", s=5, alpha=0.6)
    plt.scatter(df["window"], df["otr_cor"], color='black', label="Ori/Ter bias corrected", s=10)
    plt.axvline(x=ter, color='r', linestyle=':', label=f'Terminus: {ter}')
    plt.axvline(x=ori, color='r', linestyle=':', label=f'Origin: {ori}')
    plt.xlabel("Window (Genomic position)")
    plt.ylabel("Normalized reads")
    plt.title(f'{samplename}_Ori/Ter bias correction')
    plt.legend()
    plt_full_path = os.path.join(saveplt, '%s_OTR_corr.png' % samplename.replace(' ', '_'))
    csv_full_path = os.path.join(saveloc, '%s_bias-correct.csv' % samplename.replace(' ', '_'))
    plt.savefig(plt_full_path, format='png', bbox_inches='tight')
    df = df.reset_index(drop=True)  # reset_index returns a copy, so assign it back
    df.to_csv(csv_full_path)
    plt.close()
Hidden Markov Model (HMM) Coding and Implementation
We adapted the HMM implementation in Python from CNOGpro, an R package for CNV detection, which predicts the most probable copy-number state for normalized read-count data (Brynildsrud, 2015). The output is a table of genomic segments, each assigned the copy-number state it most likely belongs to, so the segment boundaries mark "breaks" between copy-number states.
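The table below illustrates the shape of that output with made-up values (not LTEE results): one row per run of constant copy number, with the Startpos/Endpos boundaries marking the breaks.

Startpos    Endpos     State
1           1499500    1
1499501     1580000    2
1580001     4629812    1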
import numpy as np
from scipy.special import gammaln
import pandas as pd
from scipy.stats import geom
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
def HMM_copy_number(obs, transition_matrix, emission_matrix, include_zero_state, window_length, chr_length):
    obs = np.asarray(obs)
    # One state name per emission-matrix row: copy numbers 0..n if the zero state
    # is included, 1..n otherwise
    if include_zero_state:
        states = np.arange(0, emission_matrix.shape[0])
    else:
        states = np.arange(1, emission_matrix.shape[0] + 1)
    print("Attempting to create Viterbi matrix")
    v = make_viterbi_mat(obs, transition_matrix, emission_matrix, include_zero_state)
    # For each row (position) of v, the column with the maximum value is the most probable state
    most_probable_state_path = np.argmax(v, axis=1)
    # Collect one row per run of constant state, then build the results table
    rows = []
    prev_most_probable_state = most_probable_state_path[0]
    prev_most_probable_state_name = states[prev_most_probable_state]
    start_pos = 1
    for i in range(1, len(obs)):
        most_probable_state = most_probable_state_path[i]
        most_probable_state_name = states[most_probable_state]
        if most_probable_state_name != prev_most_probable_state_name:
            print(f"Positions {start_pos} - {i * window_length}: Most probable state = {prev_most_probable_state_name}")
            rows.append({'Startpos': start_pos, 'Endpos': i * window_length,
                         'State': prev_most_probable_state_name})
            start_pos = i * window_length + 1
        prev_most_probable_state_name = most_probable_state_name
    print(f"Positions {start_pos} - {chr_length}: Most probable state = {prev_most_probable_state_name}")
    rows.append({'Startpos': start_pos, 'Endpos': chr_length,
                 'State': prev_most_probable_state_name})
    results = pd.DataFrame(rows, columns=['Startpos', 'Endpos', 'State'])
    return results
def run_HMM(experiment, n_states=5, changeprob=0.0001, include_zero_state=True, error_rate=0):
    print("Running HMM...")
    read_counts = experiment['corrected_rc']
    new_exp = experiment.copy()
    rc_max = np.max(read_counts)
    # Parameterize the emission matrix from the sample's observed mean and variance
    this_emission = setup_emission_matrix(n_states=n_states, mean=experiment['win_median'].mean(),
                                          variance=np.var(experiment['win_median']),
                                          absmax=rc_max, include_zero_state=include_zero_state,
                                          error_rate=error_rate)
    this_transition = setup_transition_matrix(n_states, remain_prob=(1 - changeprob),
                                              include_zero_state=include_zero_state)
    print("Finished setting up transition and emission matrices. Starting Viterbi algorithm...")
    # 500 bp matches the window slide length used during pre-processing
    copy_numbers = HMM_copy_number(read_counts, this_transition, this_emission,
                                   include_zero_state, 500, experiment['window'].max())
    print("Finished running Viterbi algorithm. Assigning most probable states to individual segments...")
    # Expand the per-segment states back to one copy-number call per window
    CN_HMM = []
    for cnrow in range(len(copy_numbers)):
        state = int(copy_numbers['State'][cnrow])
        hmmstart = int(copy_numbers['Startpos'][cnrow])
        hmmend = int(copy_numbers['Endpos'][cnrow])
        for win in experiment['window']:
            if (win >= hmmstart) and (win <= hmmend):
                CN_HMM.append(state)
    new_exp['CN_HMM'] = CN_HMM
    print("Copy number analysis complete.")
    return new_exp
Figure 6: HMM architecture showing the n_states possible copy numbers and the transition/emission probabilities for those states.
The emission matrix stores the probability of each possible read count under each possible copy-number state (here 0-5); the Viterbi step later works with their logarithms. The probability of each state is calculated via a negative binomial distribution (also known as a Gamma-Poisson distribution). To do this, the mean and variance of the input read counts are estimated and converted to the distribution's parameters: r = mean^2 / (variance - mean) and p = 1 - mean/variance (Brynildsrud, 2015).
def solve_pr(mean, variance):
    # Method-of-moments negative binomial parameters from the observed mean and variance
    r = (mean * mean) / (variance - mean)
    p = 1 - (mean / variance)
    return p, r

def calculate_prob(p, r, obs):
    # Negative binomial pmf, computed in log space via gammaln for numerical stability
    probs = np.exp(gammaln(r + obs) - gammaln(obs + 1) - gammaln(r) + obs * np.log(p) + r * np.log(1 - p))
    return probs

def setup_emission_matrix(n_states, mean, variance, absmax, include_zero_state=True, error_rate=0):
    emission = np.full((n_states, absmax + 1), np.nan)
    for state in range(n_states):
        # Copy number k scales both the mean and the variance k-fold
        p, r = solve_pr(mean * (state + 1), variance * (state + 1))
        for obs in range(absmax + 1):
            emission[state, obs] = calculate_prob(p, r, obs)
    if include_zero_state:
        # Zero-copy state: reads arise only from error, modeled geometrically.
        # SciPy's geom has support k >= 1 whereas R's dgeom starts at 0, so shift by one.
        zero_row = np.array([geom.pmf(i + 1, 1 - error_rate) for i in range(absmax + 1)])
        emission = np.vstack((zero_row, emission))
    return emission
Figure 7: Relationship between mean, variance, p, and r (Brynildsrud, 2015).
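As a quick sanity check of this parameterization (illustrative numbers, not LTEE data): a window mean of 50 with variance 100 gives p = 0.5 and r = 50, and the resulting pmf should sum to ~1 and reproduce the input mean.

# Illustrative check of the negative binomial parameterization (values are made up)
mean, variance = 50.0, 100.0
p, r = solve_pr(mean, variance)
print(p, r)                                   # 0.5 50.0
counts = np.arange(0, 500)
pmf = calculate_prob(p, r, counts)
print(round(pmf.sum(), 6))                    # ~1.0
print(round(float((counts * pmf).sum()), 2))  # ~50.0, recovering the input mean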
The transition matrix stores the probability of switching between the possible states, in this case between copy numbers. The probability of leaving the current state is q = 0.0001, shared equally among the other states (all such transitions are assumed equally probable), so the probability of remaining in the same state is 1 - q (Brynildsrud, 2015).
def setup_transition_matrix(n_states, remain_prob, include_zero_state=True):
    if include_zero_state:
        n_states += 1
    # Spread the total switching probability evenly over the other states
    change_prob = 1 - remain_prob
    per_state_prob = change_prob / (n_states - 1)
    transition = np.full((n_states, n_states), per_state_prob)
    for i in range(n_states):
        transition[i, i] = remain_prob
    return transition
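For intuition, a small example with made-up settings: two copy-number states plus the zero state and changeprob = 0.0001 give a 3x3 matrix with 0.9999 on the diagonal and 0.00005 elsewhere, so every row sums to 1.

# Illustrative transition matrix: 2 CN states + zero state, changeprob = 0.0001
tm = setup_transition_matrix(2, remain_prob=1 - 0.0001, include_zero_state=True)
print(tm)
# [[9.9990e-01 5.0000e-05 5.0000e-05]
#  [5.0000e-05 9.9990e-01 5.0000e-05]
#  [5.0000e-05 5.0000e-05 9.9990e-01]]
print(tm.sum(axis=1))  # [1. 1. 1.]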
The Viterbi path is calculated as the most likely state sequence: at each position, a state's score combines its emission probability for the observed read count with the best score reachable from any previous state via the transition probabilities (Brynildsrud, 2015).
def make_viterbi_mat(obs, transition_matrix, emission_matrix, include_zero_state):
    num_states = transition_matrix.shape[0]
    logv = np.full((len(obs), num_states), np.nan)
    with np.errstate(divide='ignore'):  # zero probabilities map cleanly to -inf
        logtrans = np.log(transition_matrix)
        logemi = np.log(emission_matrix)
    # Initialize: probability 0 (log = -inf) everywhere except the starting state (CN = 1)
    logv[0, :] = -np.inf
    if include_zero_state:
        logv[0, 1] = 0.0  # log(1) = 0; row 0 is the zero state, row 1 is CN 1
    else:
        logv[0, 0] = 0.0  # log(1) = 0
    for i in range(1, len(obs)):
        for l in range(num_states):
            # Emission log-probability of the observed read count under state l
            statelprobcounti = logemi[l, obs[i]]
            # Best log-probability of arriving in state l from any previous state
            maxstate = np.max(logv[i - 1, :] + logtrans[l, :])
            logv[i, l] = statelprobcounti + maxstate
    return logv
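A toy run with made-up matrices and read counts (not LTEE data) shows the output's shape: one row per observation and one column per state, with the row-wise argmax giving the most probable state path. Note the small error_rate, which keeps the zero state's log-probabilities finite.

# Toy Viterbi run; all values are illustrative
toy_emission = setup_emission_matrix(n_states=2, mean=50, variance=100,
                                     absmax=120, include_zero_state=True, error_rate=0.01)
toy_transition = setup_transition_matrix(2, remain_prob=0.9999, include_zero_state=True)
toy_obs = np.array([48, 52, 101, 99, 103, 97])
logv = make_viterbi_mat(toy_obs, toy_transition, toy_emission, include_zero_state=True)
print(logv.shape)               # (6, 3): 6 observations x 3 states (CN 0, 1, 2)
print(np.argmax(logv, axis=1))  # e.g. [1 1 1 2 2 2]: the run near ~100 reads is called CN 2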
def plot_copy(filepath, samplename):
    dir_split = filepath.strip().split('/')[:-1]
    saveplt = str('/'.join(dir_split)) + "/CNV_out/"
    saveloc = str('/'.join(dir_split))
    os.makedirs(saveplt, exist_ok=True)  # make sure the output directory exists
    df_cor = pd.read_csv(filepath, delimiter=',', header=0, index_col=0)
    df_cnv = run_HMM(df_cor)
    # Read counts on the left axis, predicted copy number on the right axis
    fig, ax1 = plt.subplots(figsize=(10, 8))
    ax2 = ax1.twinx()
    ax1.scatter(df_cnv["window"], df_cnv["win_median"], color="gray", label="Raw reads", s=10)
    ax1.scatter(df_cnv["window"], df_cnv["corrected_rc"], color="pink", label="Corrected reads", s=5)
    ax2.scatter(df_cnv["window"], df_cnv["CN_HMM"], color="red", label="Predicted Copy Number", s=5)
    ax2.set_ylim(df_cnv['CN_HMM'].min() - 0.5, df_cnv['CN_HMM'].max() + 1)
    ax1.set_xlabel("Window (Genomic position)")
    ax2.yaxis.label.set_color('red')
    ax1.set_ylabel("Read counts")
    ax2.set_ylabel("Copy Numbers")
    plt.title(f'{samplename}_Copy Number Prediction')
    ax1.legend()
    pdf_full_path = os.path.join(saveplt, '%s_copy_numbers.pdf' % samplename)
    csv_full_path = os.path.join(saveloc, '%s_CNV.csv' % samplename)
    pdf = PdfPages(pdf_full_path)
    pdf.savefig(fig, bbox_inches='tight')
    df_cnv = df_cnv.reset_index(drop=True)  # reset_index returns a copy, so assign it back
    df_cnv.to_csv(csv_full_path)
    plt.close()
    pdf.close()
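Putting the stages together end to end, a minimal driver sketch (the sample name and paths are hypothetical placeholders, assuming the coverage .tab file comes from breseq's BAM2COV output):

# Hypothetical sample name and paths; substitute your own breseq coverage output
sample = "Ara-1_50000gen"
cov_tab = "/data/ltee/Ara-1_50000gen/coverage.tab"

plot_gc_cor(cov_tab, sample)    # GC-bias correction plot
gc_cor_graphs(cov_tab, sample)  # reads-vs-GC scatter before/after correction
plot_otr_corr(cov_tab, sample)  # ori/ter correction plot + bias-corrected CSV

# plot_otr_corr writes <sample>_bias-correct.csv; feed that CSV into the HMM stage
bias_csv = "/data/ltee/Ara-1_50000gen/Bias_correction_output/Ara-1_50000gen_bias-correct.csv"
plot_copy(bias_csv, sample)     # copy-number PDF + CSV with per-window CN_HMM calls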