Bayes

This is a C# implementation of a naive Bayes classifier, based on http://en.wikipedia.org/wiki/Naive_Bayes_classifier.

I used a bell() function with normalized values, which should work the same as in Wikipedia. It might be good to somehow normalize things and calculate probabilities instead of "probability density" values, so that it would not be necessary to multiply many small numbers, which could result in zero values that cause problems. The classifier should also somehow handle zero values.

The example gives a probability of 0.00000015442866782106887 for male and 0.99999984557133215 for female.

I used a two-dimensional array, value_tab, to feed data to the algorithm. It has one row for each sample. The first column is the class number, starting from zero, and the rest are the feature values, so the C# dimensions of value_tab are [n_rows, n_features+1]. The sample to be tested is in the same format as a value_tab row, but without the leading class number (that is what we want to know).

    public class bayes_test
    {
        public void test()
        {
            double[,] value_tab = new double [,]
            { 
                { 1, 6.00, 180, 12 }, 
                { 1, 5.92, 190, 11 }, 
                { 1, 5.58, 170, 12 }, 
                { 1, 5.92, 165, 10 }, 
                { 0, 5.00, 100, 6  }, 
                { 0, 5.50, 150, 8  }, 
                { 0, 5.42, 130, 7  }, 
                { 0, 5.75, 150, 9  }, 
            };
            bayes_classifier bc = new bayes_classifier(2, value_tab);
            double[] sample = { 6,130,8};
            double p1 = bc.calc_prob(1, sample);
            double p2 = bc.calc_prob(0, sample);
            double prob1 = p1 / (p1 + p2);
            double prob2 = p2 / (p1 + p2);
        }
    }

    public class bayes_classifier
    {
        public double[,] avg;
        public double[,] std_dev;
        public int n_classes;
        public int n_rows;
        public int n_features;
        double[] p_class;

        public bayes_classifier(int p_n_classes, double[,] value_tab)
        {
            n_classes = p_n_classes;
            n_rows = value_tab.GetLength(0);
            n_features = value_tab.GetLength(1)-1;
            avg = new double[n_classes, n_features];
            std_dev = new double[n_classes, n_features];
            for (int n_class = 0; n_class < n_classes; n_class++)
            {
                calc_stats(n_class, value_tab);
            }
            calc_p_classes(value_tab);
        }

        double bell(double x)
        {
            return 1.0 / Math.Sqrt(2 * Math.PI) * Math.Exp(-0.5 * x * x);
        }

        public double p_fun(int class_num, int feature, double value)
        {
            double avg1 = value - avg[class_num, feature];
            double std_dev1 = std_dev[class_num, feature];
            double bell_arg = avg1 / std_dev1;
            double prob = 1 / std_dev1 * bell(bell_arg);
            
            return prob;
        }
        void calc_p_classes(double[,] value_tab)
        {
            p_class = new double[n_classes];
            double []sum_class_row = new double[n_classes];

            for (int row = 0; row < n_rows; row++)
            {
                int n_class = (int)value_tab[row,0];
                sum_class_row[n_class]++;
            }

            for (int c = 0; c < n_classes; c++)
            {
                p_class[c] = sum_class_row[c] / n_rows;
            }
        }
        public double calc_prob(int n_class, double[] sample)
        {
            double prob = p_class[n_class];
            for (int feature = 0; feature < n_features; feature++)
            {
                prob *= p_fun(n_class, feature, sample[feature]);
            }
            return prob;
        }

        public void calc_stats(int class_num, double[,] value_tab)
        {
            int n_rows = value_tab.GetLength(0);
            int n_cols = value_tab.GetLength(1)-1;
            double[] sum = new double[n_cols];
            double[] diff_sum2 = new double[n_cols];
            int match_class_rows = 0;

            for (int row = 0; row < n_rows; row++)
            {
                if (value_tab[row, 0] == class_num)
                {
                    match_class_rows++;
                    for (int col = 0; col < n_cols; col++)
                    {
                        double x = value_tab[row, col + 1];
                        sum[col] += x;
                    }
                }
            }
            for (int col = 0; col < n_cols; col++)
            {
                if (match_class_rows > 0)
                {
                    avg[class_num, col] = sum[col] / match_class_rows;
                }
            }
            for (int row = 0; row < n_rows; row++)
            {
                if (value_tab[row, 0] == class_num)
                {
                    for (int col = 0; col < n_cols; col++)
                    {
                        double x = value_tab[row, col+1];
                        double diff = x - avg[class_num, col];
                        diff_sum2[col] += diff * diff;
                    }
                }
            }
            for (int col = 0; col < n_cols; col++)
            {
                if (match_class_rows > 0)
                {
                    std_dev[class_num, col] = Math.Sqrt(diff_sum2[col] / match_class_rows);
                }
            }
        }
    }
Comments