Post date: Jul 01, 2017 9:39:8 AM
The Smith Waterman algorithm matches two sequences to find common sub-sequences present.
https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
The paper published about the algorithm can be found here.
http://dornsife.usc.edu/assets/sites/516/docs/papers/msw_papers/msw-042.pdf
Some things need to be noted to translate the algorithm into code. On page 196, just above the matrix, there is this formula.
W = 1.O+ 1/3 * k.
This formula is not correct and should be written
W = 1.O+ k * 1/3.
Following is code
void DNASequence()
{
const char* A = "0CAGCCUCGCUUAG";
const char* B = "0AAUGCCAUUGACGG";
int lenA = static_cast<int>(strlen(A));
int lenB = static_cast<int>(strlen(B));
Mat H = Mat(lenB, lenA, CV_32F, Scalar(0));
uchar *Hptr = H.data;
int sf = sizeof(float);
float f1, f2, f3;
float f4 = 0;
float fti, ftj;
for (int j = 1; j < lenB; j++)
{
for (int i = 1; i < lenA; i++)
{
// H(j, 0) and H(0, i) already initialized to 0
// f1 = H.at<float>(j - 1, i - 1) + similarity(A[i], B[j]);
f1 = *((float *)(Hptr + (i - 1) * sf + (j - 1) * H.step)) + similarity(A[i], B[j]);
// Calculate Hi-k,j-Wk
f2 = 0.0f;
if (i > 0)
{
for (int k = 1; k != i; k++)
{
// fti = H.at<float>(j, i - k) - weight(k);
fti = *((float *)(Hptr + (i - k) * sf + j * H.step)) - weight(k);
if (fti > f2)
f2 = fti;
}
}
// Calculate Hi,j-l-Wl
f3 = 0.0f;
if (j > 0)
{
for (int l = 1; l != j; l++)
{
// ftj = H.at<float>(j - l, i) - weight(l);
ftj = *((float *)(Hptr + i * sf + (j - l) * H.step)) - weight(l);
if (ftj > f3)
f3 = ftj;
}
}
f4 = 0;
// H.at<float>(j, i) = returnMax(f1, f2, f3, f4);
*((float *)(Hptr + i * sf + j * H.step)) = returnMax(f1, f2, f3, f4);
}
}
cout << "H:" << endl ;
displayMat(&H, A, B);
// Traceback
double minVal, maxVal;
Point minLoc, maxLoc;
minMaxLoc(H, NULL, &maxVal, NULL, &maxLoc);
string Astr = "";
string Bstr = "";
float elem = static_cast<float>(maxVal);
float pelem;
Point elemc = maxLoc;
Point pelemc;
int count = 0;
bool cont = true;
while (cont)
{
if (elemc.x == 0)
break;
Astr = A[elemc.x] + Astr;
Bstr = B[elemc.y] + Bstr;
pelemc.x = elemc.x;
pelemc.y = elemc.y;
pelem = elem;
elemc = nearestColMax(&H, elemc.x - 1, elemc);
elem = H.at<float>(elemc.y, elemc.x);
if (H.at<float>(pelemc.y - 1, pelemc.x - 1) < FLOAT_THRESHOLD)
{
cont = false;
}
else
{
if (pelemc.y > elemc.y)
{
// deleted sequence
count = pelemc.y - elemc.y - 1;
for (int i = 0; i < count; i++)
{
Astr = "-" + Astr;
Bstr = B[elemc.y + 1 - i] + Bstr;
}
}
else if (pelemc.y < elemc.y)
{
// disregard nearestColMax and return to diagonal which is non-zero > FLOAT_THRESHOLD
elemc.x = pelemc.x - 1;
elemc.y = pelemc.y - 1;
}
}
}
}
Code includes more than just the algorithm, Included is also a traceback portion of the code that attempts to output the query sequence A and the reference sequence B. This routine is not completely debugged but can output the example in the paper. An assumption of this routine is that if the max of the column appears below the current column, it is ignored.
Also note that I needed to add a "0" to the start of the sequence in order for the indices i and j to conform with the notation in the paper. The called subroutines like weight and similarity are more easily figured out and so are left out.