Post date: Jun 30, 2013 10:10:22 AM
Problem
Given an array of ints, find the most frequent non-empty subarray in it. If there are more than one such sub-arrays return the longest one/s.
Note: Two subarrays are equal if they contain identical elements and elements are in the same order.
For example: if input = {4,5,6,8,3,1,4,5,6,3,1}
Result: {4,5,6}
Solution
Reference: Dumbo method
1. Build a suffix array and sort the array. Use 2 variables - one to maintain the length of the longest repeated sub array and the other to maintain the frequency.
2. Traverse the sorted array to find out the most occurring and longest repeated subarray and return it.
Suffix array is actually a 2D array. The suffix array for the given array {4,5,6,8,3,1,4,5,6,3,1} would be as below. Here, each element of the array itself is an array.
{4,5,6,8,3,1,4,5,6,3,1}
{5,6,8,3,1,4,5,6,3,1}
{6,8,3,1,4,5,6,3,1}
{8,3,1,4,5,6,3,1}
{3,1,4,5,6,3,1}
{1,4,5,6,3,1}
{4,5,6,3,1}
{5,6,3,1}
{6,3,1}
{3,1}
{1}
After sorting the suffix array, you'd get:
{8,3,1,4,5,6,3,1}
{6,8,3,1,4,5,6,3,1}
{6,3,1}
{5,6,8,3,1,4,5,6,3,1}
{5,6,3,1}
{4,5,6,8,3,1,4,5,6,3,1}
{4,5,6,3,1}
{3,1,4,5,6,3,1}
{3,1}
{1,4,5,6,3,1}
{1}
Checking for matching subarrays is easily done in a suffix array by comparing the prefixes.
If you traverse the above sorted array and
compare adjacent elements for similarity you'd see the prefix [4,5,6] is occurring maximum number(=2)
of times and is also of maximum length.
There are other subarrays as well, like [6], [5,6],[3,1] and [1] that are occurring 2 times, but they are shorter than the subarray [4,5,6], which is our required answer. HTH.
/*
============================================================================
Author : James Chen
Email : a.james.chen@gmail.com
Description : Find the most longest or most longest and frequent subarray
Created Date : 30-Jun-2013
Last Modified :
============================================================================
*/
#include <iostream>
#include <iomanip>
#include <vector>
#include <algorithm>
#include <iterator>
using namespace std;
void FindMostFreqLongestSubArray(int* arr, int len )
{
if(len == 1) {
cout << "No repeated subarray " << endl;
return;
}
vector<vector<int>> suffixArray;
for(int i = 0; i < len; ++i){
vector<int> v;
for(int j = len - 1 - i; j <= len - 1; ++j){
v.push_back(arr[j]);
}
suffixArray.push_back(v);
}
sort(suffixArray.begin(), suffixArray.end(),
[](vector<int> v1, vector<int> v2) -> bool{
int minSize = (v1.size() < v2.size())? v1.size() : v2.size();
for(int i = 0; i < minSize; ++i){
int j = 0;
while(j < minSize && v1[j] == v2[j]) j++;
if(j < minSize) return v1[j] < v2[j];
if(minSize = j && v1.size() == minSize) return true;
return false;
}
});
// Find longest one in every group
int longestSize(0);
for(int i = 0; i < suffixArray.size() - 1; ++i){
int j = 0;
int minSize = (suffixArray[i].size() < suffixArray[i + 1].size())?
suffixArray[i].size() : suffixArray[i + 1].size();
while(j < minSize && (suffixArray[i])[j] == (suffixArray[i + 1])[j]) j++;
if(longestSize < j) longestSize = j;
}
if(longestSize == 0) {
cout << "No repeated subarray" << endl;
return;
}
cout << "The longest repeated size is " << longestSize << endl;
int mostFreq(0);
int mostFreqStartIndex(-1);
int freq(0);
int startIndex(-1);
for(int i = 0; i < suffixArray.size() - 1; ++i){
if(suffixArray[i].size() < longestSize ||
suffixArray[i + 1].size() < longestSize){
freq = 0;
startIndex = i + 1;
continue;
}
int j(0);
while(j < longestSize && (suffixArray[i])[j] == (suffixArray[i + 1])[j]) j++;
if(j == longestSize){
freq ++;
if(freq > mostFreq){
mostFreq = freq;
mostFreqStartIndex = startIndex;
}
}
else{
freq = 0;
startIndex = i + 1;
}
}
++ mostFreq;
cout << "The sub sequence are : " << endl;
for(int i = 0; i < longestSize; ++i){
cout << left << setw(5) << suffixArray[mostFreqStartIndex][i];
}
cout << endl;
cout << "The repeated time is " << mostFreq << endl;
}
void DoTest(int* arr, int len)
{
cout << "The array is " << endl;
copy(arr, arr + len, ostream_iterator<int>(cout, " "));
cout << endl;
FindMostFreqLongestSubArray(arr, len);
cout << "-------------------------" << endl;
}
int main(int argc, char* argv[])
{
int arr[] = {4,5,6,8,3,1,4,5,6,3,1};
DoTest(arr, sizeof(arr)/sizeof(arr[0]));
int arr1[] = {4,5,6,7, 8,3,1,4,5,6,3,1, 0, 4, 5, 6, 5, 8, 3, 1, 6, 8, 3,1, 2, 8, 3,1, 8, 3, 1};
DoTest(arr1, sizeof(arr1)/sizeof(arr1[0]));
int arr2[] = {4, 5, 6, 7, 8};
DoTest(arr2, sizeof(arr2)/sizeof(arr2[0]));
int arr3[] = {4, 5, 6, 7, 8, 8};
DoTest(arr3, sizeof(arr3)/sizeof(arr3[0]));
int arr4[] = {4};
DoTest(arr4, sizeof(arr4)/sizeof(arr4[0]));
return 0;
}
Output
The array is
4 5 6 8 3 1 4 5 6 3 1
The longest repeated size is 3
The sub sequence are :
4 5 6
The repeated time is 2
-------------------------
The array is
4 5 6 7 8 3 1 4 5 6 3 1 0 4 5 6 5 8 3 1 6 8 3 1 2 8 3 1 8 3 1
The longest repeated size is 3
The sub sequence are :
8 3 1
The repeated time is 5
-------------------------
The array is
4 5 6 7 8
No repeated subarray
-------------------------
The array is
4 5 6 7 8 8
The longest repeated size is 1
The sub sequence are :
8
The repeated time is 2
-------------------------
The array is
4
No repeated subarray
-------------------------
Press any key to continue . . .