Smith Waterman Algorithm

Post date: Jul 01, 2017 9:39:8 AM

The Smith Waterman algorithm matches two sequences to find common sub-sequences present.

https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm

The paper published about the algorithm can be found here.

http://dornsife.usc.edu/assets/sites/516/docs/papers/msw_papers/msw-042.pdf

Some things need to be noted to translate the algorithm into code. On page 196, just above the matrix, there is this formula.

W = 1.O+ 1/3 * k.

This formula is not correct and should be written

W = 1.O+ k * 1/3.

Following is code

void DNASequence()

{

const char* A = "0CAGCCUCGCUUAG";

const char* B = "0AAUGCCAUUGACGG";

int lenA = static_cast<int>(strlen(A));

int lenB = static_cast<int>(strlen(B));

Mat H = Mat(lenB, lenA, CV_32F, Scalar(0));

uchar *Hptr = H.data;

int sf = sizeof(float);

float f1, f2, f3;

float f4 = 0;

float fti, ftj;

for (int j = 1; j < lenB; j++)

{

for (int i = 1; i < lenA; i++)

{

// H(j, 0) and H(0, i) already initialized to 0

// f1 = H.at<float>(j - 1, i - 1) + similarity(A[i], B[j]);

f1 = *((float *)(Hptr + (i - 1) * sf + (j - 1) * H.step)) + similarity(A[i], B[j]);

// Calculate Hi-k,j-Wk

f2 = 0.0f;

if (i > 0)

{

for (int k = 1; k != i; k++)

{

// fti = H.at<float>(j, i - k) - weight(k);

fti = *((float *)(Hptr + (i - k) * sf + j * H.step)) - weight(k);

if (fti > f2)

f2 = fti;

}

// Calculate Hi,j-l-Wl

f3 = 0.0f;

if (j > 0)

{

for (int l = 1; l != j; l++)

{

// ftj = H.at<float>(j - l, i) - weight(l);

ftj = *((float *)(Hptr + i * sf + (j - l) * H.step)) - weight(l);

if (ftj > f3)

f3 = ftj;

}

f4 = 0;

// H.at<float>(j, i) = returnMax(f1, f2, f3, f4);

*((float *)(Hptr + i * sf + j * H.step)) = returnMax(f1, f2, f3, f4);

}

cout << "H:" << endl ;

displayMat(&H, A, B);

// Traceback

double minVal, maxVal;

Point minLoc, maxLoc;

minMaxLoc(H, NULL, &maxVal, NULL, &maxLoc);

string Astr = "";

string Bstr = "";

float elem = static_cast<float>(maxVal);

float pelem;

Point elemc = maxLoc;

Point pelemc;

int count = 0;

bool cont = true;

while (cont)

{

if (elemc.x == 0)

break;

Astr = A[elemc.x] + Astr;

Bstr = B[elemc.y] + Bstr;

pelemc.x = elemc.x;

pelemc.y = elemc.y;

pelem = elem;

elemc = nearestColMax(&H, elemc.x - 1, elemc);

elem = H.at<float>(elemc.y, elemc.x);

if (H.at<float>(pelemc.y - 1, pelemc.x - 1) < FLOAT_THRESHOLD)

{

cont = false;

}

else

{

if (pelemc.y > elemc.y)

{

// deleted sequence

count = pelemc.y - elemc.y - 1;

for (int i = 0; i < count; i++)

{

Astr = "-" + Astr;

Bstr = B[elemc.y + 1 - i] + Bstr;

}

else if (pelemc.y < elemc.y)

{

// disregard nearestColMax and return to diagonal which is non-zero > FLOAT_THRESHOLD

elemc.x = pelemc.x - 1;

elemc.y = pelemc.y - 1;

}

Code includes more than just the algorithm, Included is also a traceback portion of the code that attempts to output the query sequence A and the reference sequence B. This routine is not completely debugged but can output the example in the paper. An assumption of this routine is that if the max of the column appears below the current column, it is ignored.

Also note that I needed to add a "0" to the start of the sequence in order for the indices i and j to conform with the notation in the paper. The called subroutines like weight and similarity are more easily figured out and so are left out.