We selected 13 of the 300+ clone-curated sequencing datasets available on the NCBI Sequence Read Archive (SRA) from the Long-Term Evolution Experiment with E. coli (LTEE: https://the-ltee.org/) and analyzed them with breseq. The breseq output is then corrected for GC and origin-of-replication bias, and the bias-corrected read counts across the genome are passed to a Hidden Markov Model (HMM) based copy number prediction model.
Figure 5: Workflow pipeline for breseq analysis, bias correction, and HMM generation
Data Pre-Processing
breseq outputs .bam files with reads aligned to a reference genome. Coverage is then calculated at each base and written to a .tab file. We pre-process this data for downstream operations: window length (1000 bp), window slide length (500 bp), a summary metric per window, ori/ter coordinates, and GC% calculation.
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm  # used for the LOWESS fits in the bias-correction steps
from scipy import ndimage
import seaborn as sns
def preprocess(filepath: str):
    df = pd.read_csv(filepath, delimiter='\t', header=0, index_col=0)
    # Combine strand-specific coverage and drop positions with redundant (multi-mapped) reads
    df["unique_cov"] = df["unique_top_cov"] + df["unique_bot_cov"]
    df["redundant"] = df['redundant_top_cov'] + df['redundant_bot_cov']
    df = df[df['redundant'] == 0]
    genome = df["ref_base"].to_numpy()
    genome_len = len(df)
    # Total GC content of the genome
    n_g = np.count_nonzero(genome == 'G')
    n_c = np.count_nonzero(genome == 'C')
    t_gcp = ((n_g + n_c) / genome_len) * 100
    # Origin and terminus of replication coordinates from the .gbk reference of the E. coli ancestor
    ori = 3886082
    ter = 1567362
    # length = 4629812
    seq = []
    gcp_s = []
    windo = []
    windo_med_cov = []
    win_len = 1000
    win_shift = 500  # 500 bp slide (half-overlapping windows), matching the HMM step below
    i = 0
    # Inspect the genome through a sliding window of win_len bp, stepping win_shift bp at a time
    while i <= (genome_len - win_len):
        winseq = genome[i:(i + win_len)]
        seq.append(''.join(str(element) for element in winseq))
        gcc = np.count_nonzero(winseq == 'G') + np.count_nonzero(winseq == 'C')
        gcp_s.append((gcc / win_len) * 100)
        # Median of the combined unique coverage per window
        windo_med_cov.append(float(df["unique_cov"].iloc[i:(i + win_len)].median()))
        windo.append(i + win_len)
        i = i + win_shift
    # Collect the window coordinates, per-window medians, sequences, and GC%,
    # then normalize coverage and GC% by their genome-wide medians
    gc_df = pd.DataFrame({"window": windo, "sequence": seq,
                          "win_median": windo_med_cov, "GC%": gcp_s})
    gc_df["norm_cov"] = gc_df["win_median"] / gc_df["win_median"].median()
    gc_df["norm_gc"] = gc_df["GC%"] / gc_df["GC%"].median()
    # print(f'Genome length : {genome_len}')
    # print(f'Genome GC% : {t_gcp}')
    return (gc_df, ori, ter)
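A minimal usage sketch (the path below is a hypothetical placeholder for a breseq per-base coverage .tab file):

# Hypothetical input path; preprocess returns the per-window table plus ori/ter coordinates
gc_df, ori, ter = preprocess("/data/ltee/sample01/coverage.tab")
print(gc_df[["window", "win_median", "GC%", "norm_cov", "norm_gc"]].head())
print(f"ori = {ori}, ter = {ter}")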
GC Bias Correction and Plotting
This step takes the pre-processed data, estimates the GC-bias trend with LOWESS regression, and outputs GC-bias-corrected read counts for each window along with their coverage plots.
def find_nearest(array, value):
    # Return the index of the array element closest to `value`
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx

def gc_normalization(filepath):
    df, ori, ter = preprocess(filepath)
    windos = df["window"]
    # Window indices closest to the terminus (x1) and origin (x2) coordinates
    x1, x2 = find_nearest(windos, ter), find_nearest(windos, ori)
    cov = df["norm_cov"]
    gc = df["norm_gc"]
    # Fit the coverage-vs-GC trend with LOWESS and divide it out
    lowess1 = sm.nonparametric.lowess
    gc_out = lowess1(cov, gc, frac=0.8, it=3, delta=0.0, is_sorted=False,
                     missing='none', return_sorted=False)
    df["gc_cor"] = cov / gc_out
    # Rescale the corrected ratios back to read-count space using the genome-wide median
    med = df["win_median"].median()
    cor_rc = [int(df["gc_cor"].iloc[i] * med) for i in range(len(df))]
    df["gc_cor_rc"] = cor_rc
    return (df, x1, x2)
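Since the correction divides each window's normalized coverage by the fitted LOWESS trend, the corrected ratios should center near 1 outside amplified regions. A quick check (hypothetical path):

df, x1, x2 = gc_normalization("/data/ltee/sample01/coverage.tab")
print(round(df["gc_cor"].median(), 3))               # expected to be close to 1 after correction
print(df["window"].iloc[x1], df["window"].iloc[x2])  # windows nearest the terminus and origin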
def plot_gc_cor(filepath, samplename):
    df, x1, x2 = gc_normalization(filepath)
    dir_split = filepath.strip().split('/')[:-1]
    saveplt = str('/'.join(dir_split)) + "/Bias_correction_output/bias_cor_out/"
    os.makedirs(saveplt, exist_ok=True)  # make sure the output directory exists
    ter = df["window"].iloc[x1]
    ori = df["window"].iloc[x2]
    plt.figure(figsize=(10, 8))
    plt.scatter(df['window'], df['norm_gc'], color='brown', label='GC%_normalized_reads', s=3)
    plt.scatter(df["window"], df["norm_cov"], color="gray", label='Raw reads', s=10, alpha=0.7)
    plt.scatter(df['window'], df['gc_cor'], color='black', label='GC-corrected', s=5)
    plt.axvline(x=ter, color='r', linestyle=':', label=f'Terminus: {ter}')
    plt.axvline(x=ori, color='r', linestyle=':', label=f'Origin: {ori}')
    # Adding labels and title
    plt.xlabel('Window (Genomic Position)')
    plt.ylabel('Normalized reads / GC%')
    plt.title(f'{samplename}_GC bias correction')
    plt.legend()
    plt_full_path = os.path.join(saveplt, '%s_GC_corr.png' % samplename.replace(' ', '_'))
    plt.savefig(plt_full_path, format='png', bbox_inches='tight')
    plt.close()
def gc_cor_graphs(filepath, samplename):
    df, x1, x2 = gc_normalization(filepath)
    dir_split = filepath.strip().split('/')[:-1]
    saveplt = str('/'.join(dir_split)) + "/Bias_correction_output/GC_cor_plots/"
    os.makedirs(saveplt, exist_ok=True)  # make sure the output directory exists
    plt.figure(figsize=(10, 8))
    plt.scatter(df['norm_gc'], df['norm_cov'], color='lightblue', label='Raw normalized reads vs GC', s=5)
    plt.scatter(df["norm_gc"], df["gc_cor"], color="darkblue", label='Corrected normalized reads', s=10, alpha=0.7)
    # Adding labels and title
    plt.ylabel('Normalized Reads')
    plt.xlabel('Genome normalized GC%')
    plt.title(f'{samplename}_GCvsNormalizedReads')
    plt.legend()
    plt_full_path = os.path.join(saveplt, '%s_GC_vs_NormRds.png' % samplename.replace(' ', '_'))
    plt.savefig(plt_full_path, format='png', bbox_inches='tight')
    plt.close()
Origin of Replication Bias Correction and Plotting
This step takes the GC-bias-corrected data and the ori/ter coordinates, fits the replication-associated bias trend within those coordinates using LOWESS regression, and outputs ori/ter-bias-corrected read counts for each window along with their coverage plots.
def ori_lowess(df):
    # Fit a LOWESS trend of GC-corrected coverage along the genome and divide it out
    x = df["window"]
    y = df["gc_cor"]
    lowess = sm.nonparametric.lowess
    yout = lowess(y, x, frac=0.8, it=3, delta=0.0,
                  is_sorted=False, missing='none', return_sorted=False)
    ycor = list(y / yout)
    return ycor
def otr_correction(filepath):
    df, x1, x2 = gc_normalization(filepath)
    # Correct each segment between ori and ter separately; only fit a flanking
    # segment when it spans more than ~10% of the windows, otherwise leave it GC-corrected only
    if df.iloc[x1].name > int(len(df) * 0.1):
        seg1 = ori_lowess(df.iloc[:x1])
    else:
        seg1 = list(df["gc_cor"].iloc[:x1])
    if df.iloc[x2].name < int(len(df) * 0.9):
        seg3 = ori_lowess(df.iloc[x2:])
    else:
        seg3 = list(df['gc_cor'].iloc[x2:])
    seg2 = ori_lowess(df.iloc[x1:x2])
    corr = seg1 + seg2 + seg3
    df["otr_cor"] = corr
    return (df, x1, x2)
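A quick sketch of the segment layout (hypothetical path): the correction is applied separately to the windows before the terminus, between terminus and origin, and after the origin.

df, x1, x2 = otr_correction("/data/ltee/sample01/coverage.tab")
print(f"segments: 0-{x1}, {x1}-{x2}, {x2}-{len(df)}")  # three independently corrected spans
print(round(df["otr_cor"].median(), 3))                # should again center near 1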
def plot_otr_corr(filepath, samplename):
    dir_split = filepath.strip().split('/')[:-1]
    saveplt = str('/'.join(dir_split)) + "/Bias_correction_output/bias_cor_out/"
    saveloc = str('/'.join(dir_split)) + "/Bias_correction_output/"
    os.makedirs(saveplt, exist_ok=True)  # make sure the output directories exist
    df, x1, x2 = otr_correction(filepath)
    ter = df["window"].iloc[x1]
    ori = df["window"].iloc[x2]
    # Rescale corrected ratios back to read-count space; replace zero counts with one
    # before handing the data to the HMM stage
    med = df["win_median"].median()
    cor_rc = [int(df["otr_cor"].iloc[i] * med) for i in range(len(df))]
    df["corrected_rc"] = cor_rc
    df['corrected_rc'] = df['corrected_rc'].replace(0, 1)
    df["mean"] = np.mean(cor_rc)
    df["var"] = np.var(cor_rc)
    plt.figure(figsize=(10, 8))
    plt.scatter(df["window"], df["norm_cov"], color="gray", label="Raw reads", s=8)
    plt.scatter(df["window"], df["gc_cor"], color="brown", label="GC corrected", s=5, alpha=0.6)
    plt.scatter(df["window"], df["otr_cor"], color='black', label="Ori/Ter bias corrected", s=10)
    plt.axvline(x=ter, color='r', linestyle=':', label=f'Terminus: {ter}')
    plt.axvline(x=ori, color='r', linestyle=':', label=f'Origin: {ori}')
    plt.xlabel("Window (Genomic position)")
    plt.ylabel("Normalized reads")
    plt.title(f'{samplename}_Ori/Ter bias correction')
    plt.legend()
    plt_full_path = os.path.join(saveplt, '%s_OTR_corr.png' % samplename.replace(' ', '_'))
    csv_full_path = os.path.join(saveloc, '%s_bias-correct.csv' % samplename.replace(' ', '_'))
    plt.savefig(plt_full_path, format='png', bbox_inches='tight')
    df = df.reset_index(drop=True)  # reset_index returns a copy, so assign it back
    df.to_csv(csv_full_path)
    plt.close()
Hidden Markov Model (HMM) Coding and Implementation
We adapted the HMM implementation in Python from CNOGpro, an R package for CNV detection, which predicts the most probable copy-number state for normalized read-count data (Brynildsrud, 2015). The output is a table of genomic segments, each assigned the copy-number state it most likely belongs to, so the segment boundaries mark "breaks" between copy-number states.
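The table below illustrates the shape of that output with made-up values (not LTEE results): one row per run of constant copy number, with the Startpos/Endpos boundaries marking the breaks.

Startpos    Endpos     State
1           1499500    1
1499501     1580000    2
1580001     4629812    1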
import numpy as np
from scipy.special import gammaln
import pandas as pd
from scipy.stats import geom
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
def HMM_copy_number(obs, transition_matrix, emission_matrix, include_zero_state, window_length, chr_length):
    obs = np.asarray(obs)
    # One state name per emission-matrix row: copy numbers 0..n if the zero state
    # is included, 1..n otherwise
    if include_zero_state:
        states = np.arange(0, emission_matrix.shape[0])
    else:
        states = np.arange(1, emission_matrix.shape[0] + 1)
    print("Attempting to create Viterbi matrix")
    v = make_viterbi_mat(obs, transition_matrix, emission_matrix, include_zero_state)
    # For each row (position) of v, the column with the maximum value is the most probable state
    most_probable_state_path = np.argmax(v, axis=1)
    # Collect one row per run of constant state, then build the results table
    rows = []
    prev_most_probable_state = most_probable_state_path[0]
    prev_most_probable_state_name = states[prev_most_probable_state]
    start_pos = 1
    for i in range(1, len(obs)):
        most_probable_state = most_probable_state_path[i]
        most_probable_state_name = states[most_probable_state]
        if most_probable_state_name != prev_most_probable_state_name:
            print(f"Positions {start_pos} - {i * window_length}: Most probable state = {prev_most_probable_state_name}")
            rows.append({'Startpos': start_pos, 'Endpos': i * window_length,
                         'State': prev_most_probable_state_name})
            start_pos = i * window_length + 1
        prev_most_probable_state_name = most_probable_state_name
    print(f"Positions {start_pos} - {chr_length}: Most probable state = {prev_most_probable_state_name}")
    rows.append({'Startpos': start_pos, 'Endpos': chr_length,
                 'State': prev_most_probable_state_name})
    results = pd.DataFrame(rows, columns=['Startpos', 'Endpos', 'State'])
    return results
def run_HMM(experiment, n_states=5, changeprob=0.0001, include_zero_state=True, error_rate=0):
    print("Running HMM...")
    read_counts = experiment['corrected_rc']
    new_exp = experiment.copy()
    rc_max = np.max(read_counts)
    # Parameterize the emission matrix from the sample's observed mean and variance
    this_emission = setup_emission_matrix(n_states=n_states, mean=experiment['win_median'].mean(),
                                          variance=np.var(experiment['win_median']),
                                          absmax=rc_max, include_zero_state=include_zero_state,
                                          error_rate=error_rate)
    this_transition = setup_transition_matrix(n_states, remain_prob=(1 - changeprob),
                                              include_zero_state=include_zero_state)
    print("Finished setting up transition and emission matrices. Starting Viterbi algorithm...")
    # 500 bp matches the window slide length used during pre-processing
    copy_numbers = HMM_copy_number(read_counts, this_transition, this_emission,
                                   include_zero_state, 500, experiment['window'].max())
    print("Finished running Viterbi algorithm. Assigning most probable states to individual segments...")
    # Expand the per-segment states back to one copy-number call per window
    CN_HMM = []
    for cnrow in range(len(copy_numbers)):
        state = int(copy_numbers['State'][cnrow])
        hmmstart = int(copy_numbers['Startpos'][cnrow])
        hmmend = int(copy_numbers['Endpos'][cnrow])
        for win in experiment['window']:
            if (win >= hmmstart) and (win <= hmmend):
                CN_HMM.append(state)
    new_exp['CN_HMM'] = CN_HMM
    print("Copy number analysis complete.")
    return new_exp
Figure 6: HMM architecture showing the n_states possible copy numbers and the transition/emission probabilities for those states.
The emission matrix stores the probability of each possible read count under each possible copy-number state (here 0-5); the Viterbi step later works with their logarithms. The probability of each state is calculated via a negative binomial distribution (also known as a Gamma-Poisson distribution). To do this, the mean and variance of the input read counts are estimated and converted to the distribution's parameters: r = mean^2 / (variance - mean) and p = 1 - mean/variance (Brynildsrud, 2015).
def solve_pr(mean, variance):
    # Method-of-moments negative binomial parameters from the observed mean and variance
    r = (mean * mean) / (variance - mean)
    p = 1 - (mean / variance)
    return p, r

def calculate_prob(p, r, obs):
    # Negative binomial pmf, computed in log space via gammaln for numerical stability
    probs = np.exp(gammaln(r + obs) - gammaln(obs + 1) - gammaln(r) + obs * np.log(p) + r * np.log(1 - p))
    return probs

def setup_emission_matrix(n_states, mean, variance, absmax, include_zero_state=True, error_rate=0):
    emission = np.full((n_states, absmax + 1), np.nan)
    for state in range(n_states):
        # Copy number k scales both the mean and the variance k-fold
        p, r = solve_pr(mean * (state + 1), variance * (state + 1))
        for obs in range(absmax + 1):
            emission[state, obs] = calculate_prob(p, r, obs)
    if include_zero_state:
        # Zero-copy state: reads arise only from error, modeled geometrically.
        # SciPy's geom has support k >= 1 whereas R's dgeom starts at 0, so shift by one.
        zero_row = np.array([geom.pmf(i + 1, 1 - error_rate) for i in range(absmax + 1)])
        emission = np.vstack((zero_row, emission))
    return emission
Figure 7: Relationship between mean, variance, p, and r (Brynildsrud, 2015).
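As a quick sanity check of this parameterization (illustrative numbers, not LTEE data): a window mean of 50 with variance 100 gives p = 0.5 and r = 50, and the resulting pmf should sum to ~1 and reproduce the input mean.

# Illustrative check of the negative binomial parameterization (values are made up)
mean, variance = 50.0, 100.0
p, r = solve_pr(mean, variance)
print(p, r)                                   # 0.5 50.0
counts = np.arange(0, 500)
pmf = calculate_prob(p, r, counts)
print(round(pmf.sum(), 6))                    # ~1.0
print(round(float((counts * pmf).sum()), 2))  # ~50.0, recovering the input mean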
The transition matrix stores the probability of switching between the possible states, in this case between copy numbers. The probability of leaving the current state is q = 0.0001, shared equally among the other states (all such transitions are assumed equally probable), so the probability of remaining in the same state is 1 - q (Brynildsrud, 2015).
def setup_transition_matrix(n_states, remain_prob, include_zero_state=True):
    if include_zero_state:
        n_states += 1
    # Spread the total switching probability evenly over the other states
    change_prob = 1 - remain_prob
    per_state_prob = change_prob / (n_states - 1)
    transition = np.full((n_states, n_states), per_state_prob)
    for i in range(n_states):
        transition[i, i] = remain_prob
    return transition
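For intuition, a small example with made-up settings: two copy-number states plus the zero state and changeprob = 0.0001 give a 3x3 matrix with 0.9999 on the diagonal and 0.00005 elsewhere, so every row sums to 1.

# Illustrative transition matrix: 2 CN states + zero state, changeprob = 0.0001
tm = setup_transition_matrix(2, remain_prob=1 - 0.0001, include_zero_state=True)
print(tm)
# [[9.9990e-01 5.0000e-05 5.0000e-05]
#  [5.0000e-05 9.9990e-01 5.0000e-05]
#  [5.0000e-05 5.0000e-05 9.9990e-01]]
print(tm.sum(axis=1))  # [1. 1. 1.]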
The Viterbi path is calculated as the most likely state sequence: at each position, a state's score combines its emission probability for the observed read count with the best score reachable from any previous state via the transition probabilities (Brynildsrud, 2015).
def make_viterbi_mat(obs, transition_matrix, emission_matrix, include_zero_state):
    num_states = transition_matrix.shape[0]
    logv = np.full((len(obs), num_states), np.nan)
    with np.errstate(divide='ignore'):  # zero probabilities map cleanly to -inf
        logtrans = np.log(transition_matrix)
        logemi = np.log(emission_matrix)
    # Initialize: probability 0 (log = -inf) everywhere except the starting state (CN = 1)
    logv[0, :] = -np.inf
    if include_zero_state:
        logv[0, 1] = 0.0  # log(1) = 0; row 0 is the zero state, row 1 is CN 1
    else:
        logv[0, 0] = 0.0  # log(1) = 0
    for i in range(1, len(obs)):
        for l in range(num_states):
            # Emission log-probability of the observed read count under state l
            statelprobcounti = logemi[l, obs[i]]
            # Best log-probability of arriving in state l from any previous state
            maxstate = np.max(logv[i - 1, :] + logtrans[l, :])
            logv[i, l] = statelprobcounti + maxstate
    return logv
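A toy run with made-up matrices and read counts (not LTEE data) shows the output's shape: one row per observation and one column per state, with the row-wise argmax giving the most probable state path. Note the small error_rate, which keeps the zero state's log-probabilities finite.

# Toy Viterbi run; all values are illustrative
toy_emission = setup_emission_matrix(n_states=2, mean=50, variance=100,
                                     absmax=120, include_zero_state=True, error_rate=0.01)
toy_transition = setup_transition_matrix(2, remain_prob=0.9999, include_zero_state=True)
toy_obs = np.array([48, 52, 101, 99, 103, 97])
logv = make_viterbi_mat(toy_obs, toy_transition, toy_emission, include_zero_state=True)
print(logv.shape)               # (6, 3): 6 observations x 3 states (CN 0, 1, 2)
print(np.argmax(logv, axis=1))  # e.g. [1 1 1 2 2 2]: the run near ~100 reads is called CN 2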
def plot_copy(filepath, samplename):
    dir_split = filepath.strip().split('/')[:-1]
    saveplt = str('/'.join(dir_split)) + "/CNV_out/"
    saveloc = str('/'.join(dir_split))
    os.makedirs(saveplt, exist_ok=True)  # make sure the output directory exists
    df_cor = pd.read_csv(filepath, delimiter=',', header=0, index_col=0)
    df_cnv = run_HMM(df_cor)
    # Read counts on the left axis, predicted copy number on the right axis
    fig, ax1 = plt.subplots(figsize=(10, 8))
    ax2 = ax1.twinx()
    ax1.scatter(df_cnv["window"], df_cnv["win_median"], color="gray", label="Raw reads", s=10)
    ax1.scatter(df_cnv["window"], df_cnv["corrected_rc"], color="pink", label="Corrected reads", s=5)
    ax2.scatter(df_cnv["window"], df_cnv["CN_HMM"], color="red", label="Predicted Copy Number", s=5)
    ax2.set_ylim(df_cnv['CN_HMM'].min() - 0.5, df_cnv['CN_HMM'].max() + 1)
    ax1.set_xlabel("Window (Genomic position)")
    ax2.yaxis.label.set_color('red')
    ax1.set_ylabel("Read counts")
    ax2.set_ylabel("Copy Numbers")
    plt.title(f'{samplename}_Copy Number Prediction')
    ax1.legend()
    pdf_full_path = os.path.join(saveplt, '%s_copy_numbers.pdf' % samplename)
    csv_full_path = os.path.join(saveloc, '%s_CNV.csv' % samplename)
    pdf = PdfPages(pdf_full_path)
    pdf.savefig(fig, bbox_inches='tight')
    df_cnv = df_cnv.reset_index(drop=True)  # reset_index returns a copy, so assign it back
    df_cnv.to_csv(csv_full_path)
    plt.close()
    pdf.close()
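Putting the stages together end to end, a minimal driver sketch (the sample name and paths are hypothetical placeholders, assuming the coverage .tab file comes from breseq's BAM2COV output):

# Hypothetical sample name and paths; substitute your own breseq coverage output
sample = "Ara-1_50000gen"
cov_tab = "/data/ltee/Ara-1_50000gen/coverage.tab"

plot_gc_cor(cov_tab, sample)    # GC-bias correction plot
gc_cor_graphs(cov_tab, sample)  # reads-vs-GC scatter before/after correction
plot_otr_corr(cov_tab, sample)  # ori/ter correction plot + bias-corrected CSV

# plot_otr_corr writes <sample>_bias-correct.csv; feed that CSV into the HMM stage
bias_csv = "/data/ltee/Ara-1_50000gen/Bias_correction_output/Ara-1_50000gen_bias-correct.csv"
plot_copy(bias_csv, sample)     # copy-number PDF + CSV with per-window CN_HMM calls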