This type of neural network uses a fast transform, e.g. the fast Walsh Hadamard transform, as:
A fixed filter bank.
A fast fixed synthetic weight matrix.
A one-to-all connectionist algorithm. A change in one input alters all the outputs.
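As a quick illustration of that one-to-all behavior, the 4-point Walsh Hadamard transform can be written out as a fixed 4x4 matrix of +1/-1 weights. The sketch below is illustrative only (the function name oneToAllDemo and the example vector are not part of SwitchNet) and is written in the same Java style as the code further down, assuming a Processing-style environment where println() is available. Every output row uses every input, so changing a single input element moves all four outputs.

// Sketch only: the 4-point WHT written out as a fixed +1/-1 weight matrix.
void oneToAllDemo() {
  float[][] h4 = {
    { 1,  1,  1,  1 },
    { 1, -1,  1, -1 },
    { 1,  1, -1, -1 },
    { 1, -1, -1,  1 }
  };
  float[] x = { 0f, 0f, 1f, 0f }; // Change a single input element away from zero...
  for (int i = 0; i < 4; i++) {
    float sum = 0f;
    for (int j = 0; j < 4; j++) {
      sum += h4[i][j] * x[j];
    }
    println(sum); // ...and every output moves: 1, 1, -1, -1.
  }
}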
The full frequency (sequency) filtering action of the Walsh Hadamard transform is not really needed or wanted at the input and output layers of the neural network. That behavior is damped down by applying quasi-random sign flips to the data.
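The snippet below is a small sketch of why the flips matter (illustrative only; signFlipDemo is not part of SwitchNet, and it calls the whtN() function defined at the end of this page together with the same LCG used in the SWNet constructor). A smooth input pushes nearly all of its energy into a single low-sequency bin of the transform, while the same input with quasi-random sign flips has its energy spread across many bins.

// Sketch only: compare the WHT of a smooth input with and without quasi-random sign flips.
void signFlipDemo() {
  int n = 16;
  float[] smooth = new float[n];
  float[] flipped = new float[n];
  int r = 0x9E3779B9; // The same LCG seed as in the SWNet constructor below.
  for (int i = 0; i < n; i++) {
    smooth[i] = 1f; // A constant (maximally smooth) input.
    r = r*1664525 + 1013904223;
    flipped[i] = ((r & 1<<16) == 0 ? -1f : 1f) * smooth[i];
  }
  whtN(smooth);  // All of the energy lands in bin 0.
  whtN(flipped); // The energy is spread over many bins.
  println(smooth);
  println(flipped);
}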
Of course there is nothing to adjust in a standard predetermined fast transform.
The solution is to use parametric activation functions that are individually adjustable.
The version of SwitchNet shown here uses a weighted version of CReLU (2-sided ReLU) for its parametric activation functions.
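For reference, a minimal sketch of that activation function is given below (the name weightedCReLU and its standalone form are just for illustration). It has one adjustable slope for negative inputs and another for non-negative inputs; with a = 0 and b = 1 it reduces to ordinary ReLU, and with a = b = 1 it is the identity. The SWNet class that follows applies exactly this element-wise, choosing between params[paIdx] and params[paIdx+1] according to the sign of each element.

// Sketch only: the weighted (parametric) CReLU with 2 individually adjustable slopes.
float weightedCReLU(float x, float a, float b) {
  return x < 0f ? a*x : b*x;
}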
final class SWNet {
final int vecLen; // Array length of the input and output vectors. Must be an integer power of 2 like 16,32,64.....
final int widthBy; // To allow information to flow through each layer with minimal loss, the internal width of each layer must be increased. Weighted ReLU would completely block information flow about 50% of the time, and small magnitude weights would cause some further loss of information. Weighted CReLU loses a bit (the sign bit) of the information flowing through if its 2 parametric weights have different (+-) signs, and it also has the small magnitude weight problem.
// The information loss problem can be fixed by making the layers wider, providing multiple routes for information to flow through.
final int depth; // The number of layers in the neural network.
final float[] params; // Array of parameters for the neural network
final float[] surface; // Stores the output of each layer. Required for back-propagation (training).
final float[] work; // Temporary work array.
final float[] flipsIn; // Quasi-random sign flips (+1,-1) applied to the input vector to ensure the input information is well spread out by the first fast Walsh Hadamard transform in the network, rather than being concentrated into a few dimensions by the natural frequency (sequency) responses of a fast transform.
final float[] flipsOut; // Likewise for the output of the neural network.
float rate; // Learning rate for training the neural network.
SWNet(int vecLen, int widthBy, int depth, float rate) { // Object constructor.
this.vecLen = vecLen; // Input and output vector array length. Integer power of 2 such as 16,32,64,128,......
this.widthBy=widthBy; // Increase the width to allow full information flow through each layer. Must be 1,2,4,8,16....to match the requirements of the fast transform.
this.depth = depth; // The number of layers in the neural network.
this.rate=rate; // The training rate.
params = new float[2*vecLen*widthBy*depth]; // 2 weight parameters are needed for each weighted CReLU activation function in the neural network. vecLen*widthBy is the actual width of the network and depth is the number of layers.
surface=new float[vecLen*widthBy*depth]; // Stores the output of each layer except the last for the back-propagation step during training.
work=new float[vecLen*widthBy]; // Temporary work vector of the internal width of the network.
flipsIn=new float[vecLen*widthBy]; // Quasi-random input vector sign flips.
flipsOut=new float[vecLen]; // Quasi-random output vector sign flips.
for (int i = 0; i < params.length; i++) { // For all the parameters
params[i] = randomGaussian(); // give each one a random value from the standard Normal random distribution.
}
int r = 0x9E3779B9; // Initial fixed seed value for a simple linear congruential (LCG) random number generator.
for (int i=0; i<vecLen*widthBy; i++) { // For each element in flipsIn[]
r=r*1664525+1013904223; // Numerical Recipes LCG random number generator
flipsIn[i] = (r & 1<<16) == 0 ? -1f : 1f; // Pick +1 or -1 based on bit 16 of r which is somewhat random, higher bits are more random (choose from bits 0 to 31.)
}
for (int i=0; i<vecLen; i++) {
r=r*1664525+1013904223;
flipsOut[i] = (r & 1<<16) == 0 ? -1f : 1f; // Pick +1 or -1 based on bit 16 of r, for the output sign flip array.
}
} // End of constructor.
void recall(float[] result, float[] input) { // Recall an output result vector for a given input vector.
final int n=vecLen*widthBy; // Actual internal width of the neural network.
for (int i=0; i<n; i++) { // Quasi-random sign flip the input data while also (copy) expanding to the internal width of the neural network. Put the result in the work array.
work[i]=flipsIn[i]*input[i&(vecLen-1)]; //vecLen-1 acts as a binary mask, causing the array index to wrap around back to zero rather than get too large.
}
whtN(work); // Input fast Walsh Hadamard transform, scaled to leave the input vector magnitude unchanged.
for (int i = 0, paIdx=0; i < depth; i++) { // For each layer in the neural network. paIdx is the parameter index.
for (int j = 0; j < n; j ++, paIdx+=2) { // For each element in the work array calculate a weighted (parametric) CReLU activation function response and then put that response back into the work array at the same position.
if (work[j] < 0f) {
work[j] *=params[paIdx]; // Select which weight (of 2) to multiply work[j] with depending on sign (+ or -)
} else {
work[j] *=params[paIdx+1];
}
}
whtN(work); // Connect together all the parametric activation function responses using the one-to-all behavior of the fast transform, where a change in one input alters all the outputs.
}
for (int i=0; i<vecLen; i++) { // Copy part of the wide internal vector to the output vector and include quasi-random sign flips to mostly remove the spectral bias of the final fast transform.
result[i]=work[i]*flipsOut[i];
}
}
void train(float[] target, float[] input) {
final int n=vecLen*widthBy; // Forward pass. The same as during recall, except that the output of each layer (except the last) is retained in the surface array to allow back-propagation.
for (int i=0; i<n; i++) {
work[i]=flipsIn[i]*input[i&(vecLen-1)];
}
whtN(work);
for (int i = 0, paIdx=0; i < depth; i++) {
System.arraycopy(work, 0, surface, i*n, n); // Copy the layer output to the surface array.
for (int j = 0; j < n; j ++, paIdx+=2) {
if (work[j] < 0f) {
work[j] *=params[paIdx];
} else {
work[j] *=params[paIdx+1];
}
}
whtN(work);
}
for (int i=0; i<vecLen; i++) {
work[i]=work[i]*flipsOut[i];
}
for (int i=0; i<vecLen; i++) { // Start of the backward pass.
work[i]=(target[i]-work[i])*rate; //Calculate the error vector and scale by the learning rate.
}
for (int i=0; i<vecLen; i++) {
work[i]=work[i]*flipsOut[i]; // Undo the output vector sign flips, only for the array length of the output vector
}
for (int i=vecLen; i<n; i++) { // Clear the upper part of the internal array with zeros. The one-to-all connectivity of the WHT will spread the error out to the full internal width later.
work[i]=0f;
}
for (int i = depth-1; i >=0; i--) {// For each layer working backward.
whtN(work); // Send the error back through the transform. The normalized WHT is its own inverse, so applying whtN again is all that is needed.
for (int j = 0, surIdx=i*n, paIdx=2*i*n; j < n; j ++, paIdx+=2) { // paIdx is the parameter index and surIdx is the surface index. For each stored layer element working backward layer by layer.
float x=surface[surIdx+j]; // Get the value of the stored layer element.
if (x < 0f) { // Check if it is negative or positive to see which weight parameter to update.
params[paIdx]+=x*work[j]; // Adjust the weight parameter to reduce the recall error of the network.
work[j] *=params[paIdx]; // Send back the error through the weight
} else {
params[paIdx+1]+=x*work[j]; // paIdx+1 is the weight to adjust if the layer output element is positive.
work[j] *=params[paIdx+1]; // and the weight to send the error back through.
}
}
}
}
}
// Fast Walsh Hadamard Transform (normalized for scale.)
void whtN(float[] vec) {
final int n = vec.length;
int hs = 1;
while (hs < n) {
int i = 0;
while (i < n) {
final int j = i + hs; // final here is a good hint to HotSpot.
while (i < j) {
float a = vec[i];
float b = vec[i + hs];
vec[i] = a + b;
vec[i + hs] = a - b;
i += 1;
}
i += hs;
}
hs += hs;
}
float sc=1f/sqrt(n); // scale to leave overall vector magnitude unchanged
for (int i=0; i<n; i++) {
vec[i]*=sc;
}
}
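Finally, a hedged usage sketch (the vector length, width multiplier, depth, learning rate and loop count are illustrative choices, not recommendations), assuming the code is run as a Processing sketch where setup(), randomGaussian() and println() are available: learn a single toy input to target mapping, then print the target next to the recalled output. The two columns should move closer together as training proceeds.

void setup() {
  int vecLen = 16;
  SWNet net = new SWNet(vecLen, 4, 6, 0.01f); // Illustrative settings only.
  float[] input = new float[vecLen];
  float[] target = new float[vecLen];
  float[] result = new float[vecLen];
  for (int i = 0; i < vecLen; i++) {
    input[i] = randomGaussian();
    target[i] = randomGaussian();
  }
  for (int step = 0; step < 1000; step++) {
    net.train(target, input);
  }
  net.recall(result, input);
  for (int i = 0; i < vecLen; i++) {
    println(target[i] + "  " + result[i]); // Compare target with recalled output.
  }
}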