DBPredictor

http://www.cs.sfu.ca/~melli/DBPredictor/

/*********************************************************************

**********************************************************************

** DEFINED DEFAULTS

**********************************************************************

*********************************************************************/

#define        MAX_ATTRIBUTES        81

#define        TOKEN              256

#define     STRING              2056

#define     MAX_DOMAIN_SIZE       64

#define     NOMINAL_STRING     "nominal"

#define     ORDINAL_STRING     "ordinal"

#define     CONTINUOUS_STRING  "continuous"

#define     NOMINAL                0

#define     ORDINAL                1

#define     CONTINUOUS             2

#define     MAX_ROWS           200000 /* for in memory (signed short int) */

#define     NOMINAL_NA            -1

#define     CONTINUOUS_NA       -999 /* as per C4.5 */

#define     WRONG_STRING       "WRONG"

#define     CORRECT_STRING     "CORRECT"

#define     MIN                    0

#define     MAX                    1

/*********************************************************************

**********************************************************************

** GLOBAL STRUCTURES

**********************************************************************

*********************************************************************/

typedef union

{

    char  nominal ;       /* id of the attribute member */

    float continuous ;

} attr_val ;

typedef union

{

    char  nominal ;       /* id of the attribute member */

    float min_max[2] ;

} term ;

/* Rule structure */

typedef struct classifier_str {

    term   Terms[MAX_ATTRIBUTES] ;

    char   attribute_map[MAX_ATTRIBUTES]; /* degree of conj on this attr */

    char   antecedent_order[MAX_ATTRIBUTES]; /* order of specialization for this rule */

    int    a_size ;           /* number of conjunctions (ANDs) in this rule */

    float  value ;             /* rule quality value */

    int       cover ;            /* rule cover BUG - change name to cover */

    char   Rule[STRING] ; /* text based rule (SQL WHERE clause) */

    float  conseq_ratios[MAX_DOMAIN_SIZE] ;

    void   *db_priv ;         /* pointer available for the private use

                 of the db functions. e.g. to pass info

                 from create_rule to specialize_rule. */

} rule_type ;

typedef struct attribute_str

{

    char  name[TOKEN];

    short type ;

    short size ;

    /* BUG: a union would be more efficient for these two */

    /* BUG: domain members should be in alpha ascending order */

    char  members[MAX_DOMAIN_SIZE][TOKEN] ; /* for nominal */

    float   min ; /* for ordinal and continuous */

    float   max ; /* for ordinal and continuous */

   

} attribute_type ;

typedef struct domain_str {

    short attributes ;    /* number of attribute in DB_Table */

    short resp_attr ;

    attribute_type Datadict[MAX_ATTRIBUTES] ;

    long  N ;    /* number of records in DB_Table */

    /* char  Response_Attribute[MAX_STRING] ;

       char  Resp_Attr_Dom[MAX_ATTRIBUTES][MAX_STRING] ;

       short resp_attr_dom_size ; */

} domain_type ;

/*********************************************************************

**********************************************************************

** FORWARD DECLARATION

**********************************************************************

*********************************************************************/

extern int init_db (

    void **, char *, domain_type

) ;

extern int create_rule (

    void *, domain_type *, rule_type *

) ;

extern int specialize_rule (

    void *, domain_type *, rule_type *, int

) ;

extern int clear_rule (

    void *

) ;

extern int db_exit (

    void *

) ;

/* make this flag available to the other objects */

extern int debug ;

////////////////////////////////////////////////////////////////////////////////////////////

#define        AUTHOR        "Gabor_Melli@cs.sfu.ca"

#define        DATE        "1998/03/28"    /* string */

#define        VERSION        2.0        /* real number */

/*****************************************************************

** Attribute-Value Prediction (DBPredictor) program        **

**                                **

** (c) 1998 Gabor Melli                     **

** School of Computing Science / Simon Fraser University    **

**                                **

** This code is freely available for personal or academic use.  **

** Express written consent from the author is required to use    **

** this program in a commercial setting.            **

**                                **

** Comments and suggestions are very welcome!            **

**                                **

** for further information please refer to            **

** http://www.cs.sfu.ca/~melli/DBPredictor                      **

**                                **

******************************************************************

**                                **

**                                **

** EXIT STATUS:                            **

**     0:    successful                    **

**    -1:    unsuccessful                    **

**                                **

** NEW                                **

**                                **

** TO DO: http://.../DBPredictor/to_do.html            **

**                                **

** QUESTIONS: http://.../DBPredictor/questions.html        **

**                                **

** BUGS:                            **

**                                **

*****************************************************************/

/*****************************************************************

** PROGRAM STRUCTURE                        **

**                                **

** INCLUDE FILES                        **

** DEFINED DEFAULTS                        **

** SYNTAX MESSAGE                        **

** GLOBAL STRUCTURES                        **

** GLOBAL VARIABLES                        **

** FORWARD DECLARATION                        **

** MAIN                                **

**    - Process the Program Parameters            **

**    - Test the Selected Parameter Settings            **

**    - Report of the Variable Settings (when verbose)    **

**      - PEFORM PREDICTION                                     **

**                                **

** SUPPORT PROCEDURES                        **

** init_db()                            **

** create_overall()                        **

** create_rule()                        **

** print_rule()                            **

** test_rule()                            **

** C_SEP_measure()                            **

** ratio_compare()                        **

** err_handler()                        **

** msg_handler()                        **

*****************************************************************/

/*****************************************************************

******************************************************************

** INCLUDE FILES                        **

******************************************************************

*****************************************************************/

#include <stdio.h>

#include <math.h>       /* sqrt(), ceil() */

#include <stdlib.h>    /* atoi() */

#include <string.h>     /* bcopy(), bzero() */

#include <unistd.h>    /* getopt() */

#include <time.h>    /* time() */

#include "dbpredictor.h"  /* typedefs and globals */

/*********************************************************************

**********************************************************************

** LOCAL DEFINITIONS

**********************************************************************

*********************************************************************/

/* determined through empirical study */

#define   NUM_PART_RATIO  2.0  /* def. numerical partition ratio */

#define   MIN_COVER       1    /* def. smallest rule support considered */

#define   MIN_MEASURE     0.0  /* def. smallest interestingness considered */

#define   DEF_EVAL_FN     5    /* def. evaluation function */

#define   LOG10_2         0.301029996

#define   SQRT_2          1.414213562

#define   NULL_RULE_DESC   "Overall Distribution"

/*********************************************************************

**********************************************************************

** GLOBAL STRUCTURES AND VARIABLES

**********************************************************************

*********************************************************************/

struct test { /* used to temporarily sort the consequent distribution */

    int    index ;

    double    ratio ;

} ;

int debug=0;

static int verbose=0;

static time_t time_1,time_2 ; /* help to report duration between preds */

/*********************************************************************

**********************************************************************

** SYNTAX MESSAGE

**********************************************************************

*********************************************************************/

#define  USAGE  \

fprintf(stderr, "\nSYNTAX: %s [-bdhisv] -tDFR option\n\n", argv[0])   ; \

fprintf(stderr, "\td:\tdebug msgs to stderr\n")            ; \

fprintf(stderr, "\th:\thelp\n")                    ; \

fprintf(stderr, "\ti:\tinteractive (one at a time vs batch)\n")    ; \

fprintf(stderr, "\tv:\tverbose\n")                ; \

fprintf(stderr, "\n")                        ; \

fprintf(stderr, "\tt:\tX (report accuracy within X)\n")            ; \

fprintf(stderr, "\tD:\tDatabase Table\n")            ; \

fprintf(stderr, "\tf:\tFilestem\n")                ; \

fprintf(stderr, "\tR:\tResponse attribute to predict\n")    ; \

fprintf(stderr, "\te:\tevaluation function [%d]\n", DEF_EVAL_FN)        ; \

fprintf(stderr, "\tm:\tmin heuristic measure [%g]\n", MIN_MEASURE) ; \

fprintf(stderr, "\tc:\tmin cover [%d]\n", MIN_COVER)            ; \

fprintf(stderr, "\tn:\tnumerical attr div ratio [%g]\n", NUM_PART_RATIO) ; \

fprintf(stderr, "\n")                        ; \

fprintf(stderr, "\tExample\n")                    ; \

fprintf(stderr, "\t%s -f iris -Rclass -v\n",argv[0])        ; \

fprintf(stderr, "\n")                        ;

/*********************************************************************

**********************************************************************

** FORWARD DECLARATION

**********************************************************************

*********************************************************************/

extern void bzero(void *, int) ;

extern void bcopy (const void *, void *, int) ;

extern int  getopt(int, char * const [], const char *) ;

extern char *strtok(char *, const char *) ;

static void test_rule(

    FILE *, struct domain_str *, struct classifier_str *, char *, int ) ;

static void print_rule(

    FILE *, struct domain_str, struct classifier_str * ) ;

static int eval_fn(

    struct classifier_str *, struct classifier_str *, struct domain_str, int ) ;

   

/* static int ratio_compare (struct test *, struct test * ) ; */

static int ratio_compare (const void *, const void *) ;

static void set_rule_description (rule_type *, domain_type) ;

/*****************************************************************

******************************************************************

** MAIN                                **

******************************************************************

*****************************************************************/

int main(int argc, char *argv[])

{

    FILE *datadict_file=NULL ;

    FILE *test_file=NULL ;

    FILE *output_file=NULL;

    char Filestem[100] ;

    char Datadict_filename[256] ;

    char Output_filename[256] ;

 

    char Tests_filename[256] ;

    char DB_Table_Name[256] ;

    char Response_Attribute[TOKEN] = {0} ;

    int   min_cover      = MIN_COVER ;

    float min_measure    = MIN_MEASURE ;

    float num_part_ratio = NUM_PART_RATIO ;

    int   eval_function  = DEF_EVAL_FN ;

    int  i ;                       /* counter */

    int  interactive=1 ;

    char datadict_string[STRING] ;

    char test_string[STRING] ;     /* holds event vector values */

    char test_accuracy=1 ;         /* default: best class accuracy */

    void *DB_ptr;

    int  test_counter=0 ;

    domain_type Domain_Info ;

    rule_type    Max_Rule ;

    /***********************************************

    ** INITIALIZE

    ***********************************************/

    DB_Table_Name[0]      = (char)NULL ;

    Filestem[0]           = (char)NULL ;

    Output_filename[0]    = (char)NULL ;

    Response_Attribute[0] = (char)NULL ;

    /***********************************************

    ** DECOMPOSE THE PROGRAM ARGUMENTS

    ***********************************************/

    {

    char c ;        /* parameter flag character */

    extern char *optarg ;

   

    while ((c = getopt(argc, argv, "dhisvt:f:D:R:n:c:m:e:")) != -1)

        switch(c)

        {

        case 'd': /* Debug */

        debug=1 ;

        verbose=1 ;

        break ;

        case 'h': /* Simple help message */

        USAGE ;

        exit(-1) ;

       

        case 'i': /* Interactive instead of batch */

        interactive=1 ;

        break ;

       

        case 't':  /* Test accuracy instead of report rules */

        test_accuracy=atoi(optarg) ;

        break ;

       

        case 'v': /* Verbose */

        verbose=1 ;

        break ;

       

        case 'f':  /* file prefix */

        strcpy(Filestem, optarg) ;

        break ;

       

        case 'D':  /* Database table */

        strcpy(DB_Table_Name, optarg) ;

        break ;

       

        case 'R':  /* Response attribute to predict */

        strcpy(Response_Attribute, optarg) ;

        break ;

       

        case 'c':  /* Minimum cover parameter */

        sscanf(optarg, "%d", &min_cover) ;

        break ;

       

        case 'm':  /* Minimum measure */

        sscanf(optarg, "%f", &min_measure) ;

        break ;

       

        case 'n':  /* Response attribute to predict */

        sscanf(optarg, "%f", &num_part_ratio) ;

        break ;

       

        case 'e':  /* Evaluation function */

        if (!strcmp(optarg,"cos"))

            eval_function = 1;

        else if(!strcmp(optarg,"dist"))

            eval_function = 2;

        else if(!strcmp(optarg,"entropy"))

            eval_function = 3;

        else if(!strcmp(optarg,"cos2"))

            eval_function = 4;

        else if(!strcmp(optarg,"dist2"))

            eval_function = 5;

        else if(!strcmp(optarg,"entropy2"))

            eval_function = 6;

        else {

            fprintf(stderr,"ERROR: Invalid parameter -e %s\n", optarg);

            exit(-1);

        }

        break ;

        }

    }

    /* Report the version information in verbose mode */

    if (verbose) fprintf(stdout,

             "v%4.2f(%s)\n - Min Measure %g\tMin Cover:%d\tNum Prun: %g\tEval Function:%d\n\n",

             VERSION, DATE, min_measure, min_cover, num_part_ratio, eval_function );

    /*****************************************************/

    /* TEST FOR APPROPRIATE VALUES & REQUIRED PARAMETERS */

    /*****************************************************/

    if (Response_Attribute == (char)NULL) {

    fprintf(stderr, "ERROR: Response attribute parameter (-R) not set\n");

    exit(-1);

    }

    if (Filestem[0] == (char)NULL) {

    fprintf(stderr, "ERROR: Filestem parameter (-f) not set\n");

    exit(-1);

    }

    /*

    if (test_accuracy < 1) {

    fprintf(stderr, "ERROR: test accuracy of [%d] is invalid\n",

        test_accuracy );

    exit(-1);   

    }

    */

    /***********************************************

    ** SETUP AND TEST I/O FILES

    ***********************************************/

#if DBMS==1

    if (debug) fprintf(stderr, "FILE-based dataset\n") ;

    if (DB_Table_Name[0] == (char)NULL) {

    sprintf(DB_Table_Name, "%s.data", Filestem) ;

    }

#elif DBMS==2

    if (debug) fprintf(stderr, "SYBASE DB-LIB based dataset\n") ;

    if (DB_Table_Name[0] == (char)NULL) {

    sprintf(DB_Table_Name, "%s", Filestem) ;

    }

#endif

    sprintf(Datadict_filename, "%s.dict", Filestem) ;

    if ((datadict_file = fopen(Datadict_filename, "r")) == NULL) {

    fprintf(stderr,

        "ERROR: Could not open the data dictionary file: [%s]\n",

        Datadict_filename);

    exit(-1);

    }

    sprintf(Tests_filename, "%s.test", Filestem) ;

    if ((test_file = fopen(Tests_filename, "r")) == NULL) {

    fprintf(stderr,

        "ERROR: Could not open the tests file: %s\n",

        Tests_filename);

    exit(-1);

    }

    if (interactive) { /* interactive sessions pipe the results to stdout */

        strcpy(Output_filename, "<stdout>") ;

    output_file = stdout ;

    }

    else {

    sprintf(Output_filename, "%s.output", Filestem) ;

    if ((output_file = fopen(Output_filename, "w")) == NULL) {

        fprintf(stderr, "ERROR: Could not open output file: %s\n",

            Output_filename);

        exit(-1);

    }

    }

    /***********************************************

    ** DEBUG

    ***********************************************/

    if (debug) {

    fprintf(stderr, "debug: \n") ;

    fprintf(stderr, "debug: This is a selection query\n") ;

    if (interactive) fprintf(stderr, "debug: Interactive session\n") ;

    else fprintf(stderr, "debug: Batch session\n") ;

    if (test_accuracy) fprintf(stderr, "debug: Test accuracy query\n") ;

    else fprintf(stderr, "debug: Report rule query\n") ;

    fprintf(stderr, "debug: DB_Table_Name       = [%s]\n", DB_Table_Name) ;

    fprintf(stderr, "debug: Response Attrribute = [%s]\n",

        Response_Attribute) ;

    fprintf(stderr, "debug: Data Dict. file     = [%s]\n",

        Datadict_filename) ;

    fprintf(stderr, "debug: Tests file          = [%s]\n",

        Tests_filename) ;

    fprintf(stderr, "debug: Output file         = [%s]\n",

        Output_filename) ;

    fprintf(stderr, "debug: \n") ;

    }

    /***********************************************

    ** READ IN THE DATA DICTIONARY INFORMATION

    ***********************************************/

    Domain_Info.attributes=0 ;

    fgets(datadict_string, STRING, datadict_file) ;

    while (!feof(datadict_file)) {

    char    *token ;

    char    attr_desc[MAX_ATTRIBUTES][TOKEN] ;

    float   min ;

    float   max ;

    int     tokens, i ;

   

        /* prune the ".\n" ending from the buffer if it exists */

    if ((token=strstr(datadict_string,".\n")) != NULL)

        *token = (char)NULL ;

    if ((token=strstr(datadict_string,"\n")) != NULL)

        *token = (char)NULL ;

    /* if (debug)fprintf(stderr, "debug: datadict_string %s\n", datadict_string); */

    /* Place this attributes tokens into an array */

    tokens=0 ;

        token = strtok(datadict_string, ":") ;

        strcpy(attr_desc[tokens], token) ;

    for (token=strtok(NULL, ","), i=1 ;

         token != NULL ;

         token=strtok(NULL, ","), i++ ) {

            /* get rid of any initial spaces */

        while(*token==' ' || *token=='\t') token++ ;

        strcpy(attr_desc[i], token) ;

        if(debug)fprintf(stderr, " [%s]\n", attr_desc[i]);

    }

   

        tokens=i ;

        if (tokens<3) {

        fprintf(stderr, "ERROR in %s at line %d: Information missing.\n",

            Datadict_filename, Domain_Info.attributes+1) ;

        exit(-1);

    }

    strcpy(Domain_Info.Datadict[Domain_Info.attributes].name,

            attr_desc[0]) ;

    if (!strcmp(NOMINAL_STRING, attr_desc[1] ) )

        Domain_Info.Datadict[Domain_Info.attributes].type = NOMINAL ;

        else if (!strcmp(ORDINAL_STRING, attr_desc[1] ))

        Domain_Info.Datadict[Domain_Info.attributes].type = ORDINAL ;

        else if (!strcmp(CONTINUOUS_STRING, attr_desc[1] ) )

        Domain_Info.Datadict[Domain_Info.attributes].type = CONTINUOUS ;

        else {

        fprintf(stderr, "ERROR in %s: invalid domain type %s.\n",

            Datadict_filename, attr_desc[1]) ;

        exit(-1);

    }   

    /* Determine this attributes domain membership */

    if (Domain_Info.Datadict[Domain_Info.attributes].type != NOMINAL) {

        if (sscanf(attr_desc[2], "%f", &min) != 1) {

        fprintf(stderr, "ERROR in %s: invalid min value [%s].\n",

            Datadict_filename, attr_desc[2] ) ;

        exit(-1) ;

        }

        else

        Domain_Info.Datadict[Domain_Info.attributes].min = min ;

       

        if (sscanf(attr_desc[3], "%f", &max) != 1) {

        fprintf(stderr, "ERROR in %s: invalid max value [%s].\n",

            Datadict_filename, attr_desc[2]) ;

        exit(-1) ;

        }

        else

        Domain_Info.Datadict[Domain_Info.attributes].max=max ;

        /* if (debug) fprintf(stderr, "debug: min[%f], max[%f]\n",

                     Domain_Info.Datadict[Domain_Info.attributes].min,

                     Domain_Info.Datadict[Domain_Info.attributes].max); */

        }

   

    else {

        for (i=2 ; i<tokens ; i++) {

        strcpy(Domain_Info.Datadict[Domain_Info.attributes].members[i-2],

               attr_desc[i]) ;

       

        if (debug) fprintf(stderr, "[%s] ",

                    Domain_Info.Datadict[Domain_Info.attributes].members[i-2]) ;

           

        }

        Domain_Info.Datadict[Domain_Info.attributes].size = i-2 ;

    }

    if (debug) fprintf(stderr, "\ndebug: Datadict entry[%d] type[%d] size[%d] name[%s]\n",

               Domain_Info.attributes,

               Domain_Info.Datadict[Domain_Info.attributes].type,

               Domain_Info.Datadict[Domain_Info.attributes].size,

               Domain_Info.Datadict[Domain_Info.attributes].name

               );

   

    /* Proceed to the next data dictionary item */

    fgets(datadict_string, STRING, datadict_file) ;

    Domain_Info.attributes++ ;

    if (Domain_Info.attributes > MAX_ATTRIBUTES) {

        fprintf(stderr,

            "ERROR: number of attributes is greater than MAX [%d]\n",

            MAX_ATTRIBUTES);

        exit(1) ;

    }

   

    }

    /*******************************************************/

    /* LOCATE THE RESPONSE VARIABLE IN THE DATA DICTIONARY */

    /*******************************************************/

    Domain_Info.resp_attr = -1 ; /* default is out of bounds */

    for (i=0; i<Domain_Info.attributes; i++) {

    /* if (debug) fprintf(stderr, "[%s]^[%s]\n", Response_Attribute,

        Domain_Info.Datadict[i].name) ; */

   

    if(!strcmp(Response_Attribute,

           Domain_Info.Datadict[i].name))

        Domain_Info.resp_attr=i ;

    }

    if (Domain_Info.resp_attr == -1) {

    fprintf(stderr, "ERROR: The response attribute [%s] ",

        Response_Attribute);

    fprintf(stderr, "was not found in the data dictionary\n");

    exit(-1);

    }

    else if (Domain_Info.Datadict[Domain_Info.resp_attr].type != NOMINAL) {

    fprintf(stderr, "ERROR: The response attribute [%s] ",

        Response_Attribute);

    fprintf(stderr, "must be of type [%s]\n", NOMINAL_STRING);

    exit(-1);

    }

    else if(debug) {

    fprintf(stderr,

        "debug: response attr found. Datadict[%d] = [%s]\n",

        Domain_Info.resp_attr,

        Domain_Info.Datadict[Domain_Info.resp_attr].name) ;

    }

    /***********************************************

    ** INITIALIZE THE DATABASE COMMUNICATION CHANNEL

    ***********************************************/

    if (!init_db(&DB_ptr, DB_Table_Name, Domain_Info)) {

    fprintf(stderr, "ERROR: Unable to initialize the database\n");

    exit(-1);

    }

    if (debug) fprintf(stderr, "debug:  Domain=[%d]\n",

               Domain_Info.Datadict[Domain_Info.resp_attr].size) ;

   

    if (Domain_Info.Datadict[Domain_Info.resp_attr].size > MAX_DOMAIN_SIZE) {

    fprintf(stderr,

    "ERROR: The number of distinct values in the response attribute %s",

        Response_Attribute) ;

    fprintf(stderr,

    "\t[%d] is greater than the current maximum allowed [%d].\n",

        Domain_Info.Datadict[Domain_Info.resp_attr].size,

        MAX_ATTRIBUTES) ;

    exit(-1) ;

    }

    /***********************************************

    ** BEGIN PROCESSING EVENT VECTORS

    ***********************************************/

    time_1=time(NULL) ;

    fgets(test_string, STRING, test_file);

    while (!feof(test_file)) {

    attr_val event_vector[MAX_ATTRIBUTES];

    char     *token, *str_ptr ;

        if ((str_ptr=strstr(test_string,".\n"))!=NULL) *str_ptr = (char)NULL ;

        if ((str_ptr=strrchr(test_string,'\n'))!=NULL) *str_ptr = (char)NULL ;

    if (verbose) {

        fprintf(output_file,

            "\n***********************************************************\n");

        fprintf(output_file, "Make a prediction for test event #%d\n\n",

            ++test_counter);

    }   

    if(debug) fprintf(stderr, "debug: process next test event [%s]\ndebug: ",

              test_string) ;

    for (token = strtok(test_string,","), i=0;

         token != NULL ;

         token = strtok(NULL, ","), i++ )

    {

        if (Domain_Info.Datadict[i].type == NOMINAL) {

        int member, j;

       

        /* get rid of any initial spaces */

        while(*token==' ') token++ ;

        if (!strcmp(token, "?"))

            member=NOMINAL_NA ;

        else {

            /* BUG cannot handle any padded whitespaces */

            for (j=0, member=NOMINAL_NA-1;

             j<Domain_Info.Datadict[i].size;

             j++)

            if (!strcmp(token, Domain_Info.Datadict[i].members[j]))

                member=j ;

       

            if (member==NOMINAL_NA-1) {

            fprintf(stderr, "ERROR in test file: [%s] ", token) ;

            fprintf(stderr, "is not a member of attr [%s]\n",

                Domain_Info.Datadict[i].name );

            exit(-1);

            }

        }

        event_vector[i].nominal = member ;

       

        if (debug) fprintf(stderr, "e_v[%d][%s]->%d ",

                   i, token, member) ;

        }

        else {

        float value ;

       

        /* get rid of any initial spaces */

        while(*token==' ') token++ ;

        if (!strcmp(token, "?"))

            value=CONTINUOUS_NA ;

       

        else {

            if (sscanf(token, "%f", &value) != 1) {

            fprintf(stderr, "ERROR in test entry [%s]\n", test_string) ;

            fprintf(stderr, "\t[%s] for attr [%s] is not numerical.\n",

                token, Domain_Info.Datadict[i].name );

            exit(-1);

            }

            if (value>Domain_Info.Datadict[i].max) {

            fprintf(stderr, "ERROR in test entry %s:\n",test_string) ;

            fprintf(stderr,

                           "\t%f for attr %sis larger than the domain max %f.\n",

                value, Domain_Info.Datadict[i].name,

                Domain_Info.Datadict[i].max);

            exit(-1);

            }

            else if (value<Domain_Info.Datadict[i].min) {

            fprintf(stderr, "ERROR in test entry %s:\n",test_string) ;

            fprintf(stderr,

                           "\t%f for attr %sis smaller than the domain min %f.\n",

                value, Domain_Info.Datadict[i].name,

                Domain_Info.Datadict[i].min);

            exit(-1);

            }

        }

        if (debug) fprintf(stderr, "e_v[%d][%s]->%f ",

                   i, token, value) ;

       

        event_vector[i].continuous = value ;

        }

    } /* end of foreach attribute-value */

       

        /* complete the debug line */

    if (debug) fprintf(stderr, "\n") ;

    /*******************************************************

    **  DISCOVER THE PREDICTED RULE FOR THIS EVENT VECTOR **

        *******************************************************/

    {

        int    i ;

       

        rule_type Null_Rule ;      /* The prediction if no info given */

        rule_type Max_Test_Rule ;

        rule_type Test_Rule ;

    /*****************************************

    ** DO A TOP DOWN SEARCH UNTIL LOCAL MAXIMA

    *****************************************/

        if(debug) fprintf(stderr,

                "debug: PREDICT (Top Down Search Until Local Maxima)\n") ;

        /* Get and keep the overall distribution for future usage*/

        bzero(&Null_Rule, sizeof(rule_type)) ;

        clear_rule(Null_Rule.db_priv) ;

        create_rule(DB_ptr, &Domain_Info, &Null_Rule);

        /* clear_rule(Null_Rule.db_priv) ;*/

       

        Domain_Info.N = Null_Rule.cover ;

        /* Start the Max_Rule out as the Null_Rule */

        bcopy(&Null_Rule, &Max_Rule, sizeof(struct classifier_str) ) ;

        /* while specializations are possible */

        bzero(&Max_Test_Rule, sizeof(rule_type)) ;

        Max_Test_Rule.value = 1;

        while (Max_Test_Rule.value > min_measure ) {

        if(debug)fprintf(stderr,"debug: perform a round of specialization\n");

        Max_Test_Rule.value = 0  ;

        /* try to specialize on every given attribute-value */

        /* BUG: this should be a random selection so that ties */

                /*      do not necessarily go the the first attribute */

        for (i = 0; i < Domain_Info.attributes; i++) {

            /* is this a candidate term for conjunction? */

            if (     i != Domain_Info.resp_attr

                 && event_vector[i].nominal != NOMINAL_NA

                 && event_vector[i].continuous != CONTINUOUS_NA  ) {

            bcopy(&Max_Rule, &Test_Rule, sizeof(rule_type)) ;

            /* update the size of the antecedent if prev unused */

            if (!Test_Rule.attribute_map[i]) Test_Rule.a_size++ ;

            if (Domain_Info.Datadict[i].type == NOMINAL ) {

               

                Test_Rule.Terms[i].nominal = event_vector[i].nominal ;

                            /* if (debug) fprintf(stderr, "assigning %d to term %d\n",

                   t_member, i); */

            }

            else {

                float min, max, rmin, rmax ;

                float delta ;

               

                /* if this is the first specialization on this

                   non-nominal attribute then use the domain bounds */

                if (!Test_Rule.attribute_map[i])

                delta = Domain_Info.Datadict[i].max -

                    Domain_Info.Datadict[i].min ;

                else

                delta = Test_Rule.Terms[i].min_max[1] -

                    Test_Rule.Terms[i].min_max[0] ;

                /* prune the range accordingly*/

                delta /= num_part_ratio ;

               

                delta /= 2.0 ; /* half on each side */   

               

                min = rmin = event_vector[i].continuous - delta ;

                max = rmax = event_vector[i].continuous + delta ;

               

                /* Round off ordinal types */

                if (Domain_Info.Datadict[i].type == ORDINAL ) {

                min = (float)floor(rmin+0.49999) ;

                max = (float)floor(rmax+0.49999) ;

                }

                Test_Rule.Terms[i].min_max[MIN]=min ;

                Test_Rule.Terms[i].min_max[MAX]=max ;

               

            } /* end of term composition */

           

            /* signal that this attribute has been used */

            Test_Rule.attribute_map[i]++ ;

           

            /* update the text Rule */

            if (verbose) set_rule_description(&Test_Rule, Domain_Info) ;

            /*****************************************************/

            /* call SPECIALIZE_RULE()                            */

            /* get the rule's cover and consequent from the data */

            /* create_rule(DB_ptr, &Domain_Info, &Test_Rule); */

                        specialize_rule(DB_ptr, &Domain_Info, &Test_Rule, i);

            /*****************************************************/

            /* call EVALUATION_FUNCTION()                        */

                        /* get the quality of this rule */

            if (Test_Rule.cover >= min_cover) {

                eval_fn(&Test_Rule, &Max_Rule, Domain_Info,

                    eval_function) ;

            }           

            else {

                Test_Rule.value = 0 ;

            }

           

           

            /* is this test rule better than the current best test */

            if (Max_Test_Rule.value < Test_Rule.value) {

                if (debug) fprintf(stderr,

                           "NEW WINS *** %f < %f\tcover %d\n",

                           Max_Test_Rule.value, Test_Rule.value,

                           Test_Rule.cover);

                clear_rule(Max_Test_Rule.db_priv) ;

                bcopy(&Test_Rule, &Max_Test_Rule,

                  sizeof(rule_type) ) ;

            }

            else {

                clear_rule(Test_Rule.db_priv) ;

                if (debug) fprintf(stderr,

                    "old max_test stays %f < %f\tcover %d\n",

                    Max_Test_Rule.value, Test_Rule.value,

                    Test_Rule.cover);

            }

            }

           

            else { /* may want to find out why this term was rejected */

            if (debug) fprintf(stderr,

               "debug: addition of term [%d] to rule [%s] FAILED.\n",

               i, Max_Rule.Rule) ;

            }

           

        } /* go back and try any other potential term to specialize on */

       

        if(debug)fprintf(stderr,"debug: end round of specialization\n");

       

        /* Have we really specialized any further? */

        if (Max_Test_Rule.value > min_measure) {           

            clear_rule(Max_Rule.db_priv);

           

            /* update the max rule to be the winning test rule*/

            bcopy(&Max_Test_Rule, &Max_Rule, sizeof(rule_type) ) ;

            bzero(&Max_Test_Rule, sizeof(rule_type)) ;

            Max_Test_Rule.value = 1;

            if (debug) {

            fprintf(stderr,

                          "NEW BEST RULE: COVER: %d\tValue %f\t\n",

                   Max_Rule.cover, Max_Rule.value) ;

            print_rule(output_file, Domain_Info, &Max_Rule);

           

            test_rule(output_file, &Domain_Info, &Max_Rule,

                               Domain_Info.Datadict[Domain_Info.resp_attr].members[(int)event_vector[Domain_Info.resp_attr].nominal],

                               test_accuracy);

            }

        }

       

        } /* while specializations are possible) */

       

        if(debug)fprintf(stderr,"debug: Max Rule found - cover %d !> %d\n",

                 Max_Rule.cover,  Max_Test_Rule.cover) ;

       

       

    } /* end of predictions section */

   

   

    /***********************************************************/

    /** PROCESS THE PREDICTED (OR SELECTED) RULE AS REQUESTED **/

    /***********************************************************/

    if(verbose)

        print_rule(output_file, Domain_Info, &Max_Rule);

    /* Report whether this prediction was correct to the appropriate degree

       given that the correct value has been provided in the event vector */

    if(test_accuracy && event_vector[Domain_Info.resp_attr].nominal!=NOMINAL_NA ) {

        int resp_attr=Domain_Info.resp_attr ;

        int member ;

       

        member = event_vector[resp_attr].nominal ;

       

        if (debug) fprintf(stderr, "%d\t%d\t[%s]\n", resp_attr, member,

            Domain_Info.Datadict[resp_attr].members[member]) ;

       

        test_rule(output_file, &Domain_Info, &Max_Rule,

              Domain_Info.Datadict[resp_attr].members[member],

              test_accuracy);

    }     

        /* release this rule */   

    clear_rule(Max_Rule.db_priv) ;

       

    /* prepare for the next test event */

    fgets(test_string, STRING, test_file);

   

    } /* while there are more test events*/

   

    /***********************************************/

    /** FINISH                                    **/

    /***********************************************/

    if (debug) fprintf(stderr, "debug: about to FINISH\n") ;

    db_exit(DB_ptr) ;

    if (output_file != stdout) fclose(output_file) ;

    exit(0) ;

} /* EXIT main() */

/*****************************************************************

******************************************************************

** SUPPORT FUNCTIONS                         **

******************************************************************

*****************************************************************/

/*****************************************************************************

** Class Separation Distance Measure

** ORT  measure proposed in Fayyad,Irani[AAAI-92]

** This is the cosine of the angle between the two distributions

*****************************************************************************/

int    eval_fn(

    rule_type    *TR ,    /* Test Rule */

    rule_type    *MR ,    /* Max Rule */

    domain_type     DI ,    /* Domain Info */

    int              measure

)

{

    int i ;

    int classes = DI.Datadict[DI.resp_attr].size;

                

    double TR_cover=TR->cover ;

    double MR_cover=MR->cover ;

    double TRC_cover ;

    double TRC_ratio=0 ;

    double TR_ratio ;     /* store into variable for efficiency */

    double MR_ratio ;     /* store into variable for efficiency */

    double sum_abab = 0 ; /* Sum along dimensions of squares of (TR*TRC)'s */

    double sum_ab2 = 0 ;  /* Sum along dimensions of squares of (TR-TRC)'s */

    double sum_a2 = 0 ;   /* Sum along dimensions of squares of TR's */

    double sum_b2 = 0 ;   /* Sum along dimensions of squares of TRC's */

    /* for entropy calc */

    /* double log10_2 = log10(2) ;  hardcode to calc base2 logs */

    double entropy_p = 0 ;

    double entropy_l = 0 ;

    double entropy_r = 0 ;

    double entr_norm = 0 ; /* entropy normalization quotient */

    /* does this rule (or its complement) have a cover? */

    if ( (TR_cover==0) || (MR_cover-TR_cover==0) ) {

    TR->value = (float)0 ;

    if (debug) fprintf(stderr, "debug: Everything matched!\n") ;

    return(0) ;

    }

    /* get the normalization quotient for this entropy measure version */

    if (measure==6) {

    /* foreach class ratio */

    for (i=0; i<classes; i++) {

        /* if the ratio is zero then add zero (nothing) */

        if (TR->conseq_ratios[i] > 0.00001)

        entr_norm += TR->conseq_ratios[i]/MR->conseq_ratios[i] ;

    }

    }

   

    /* foreach class value accumulate the differences */

    for (i=0; i<classes; i++) {

    TR_ratio = TR->conseq_ratios[i] ; /* make local for efficiency */

    MR_ratio = MR->conseq_ratios[i] ; /* make local for efficiency */

    /* Set the TRC_ratio based on the parents and sister's distributions*/

    TRC_cover  = MR_ratio*MR_cover - TR_ratio*TR_cover ;

    TRC_ratio  = TRC_cover / (MR_cover - TR_cover) ;

    if (debug) fprintf(stderr, "%f %f    %f %f    %f %f\n",

        MR->conseq_ratios[i], MR->conseq_ratios[i]*MR->cover,

        TR_ratio, TR_ratio*TR_cover,

           TRC_ratio, TRC_cover ) ;

    /* for Cos() */

    if (measure==1) {

        sum_abab += ( TR_ratio  * TRC_ratio ) ;

        sum_a2   += ( TR_ratio  * TR_ratio  ) ;

        sum_b2   += ( TRC_ratio * TRC_ratio ) ;

        }

    /* for Cos2() */

    if (measure==4) {

        sum_abab += ( TR_ratio  * MR_ratio  ) ;

        sum_a2   += ( TR_ratio  * TR_ratio  ) ;

        sum_b2   += ( MR_ratio  * MR_ratio  ) ;

        }

    /* for Distance() */

    if (measure==2) {

        sum_ab2 += (TR_ratio - TRC_ratio)*(TR_ratio - TRC_ratio);

        /* fprintf(stderr,

            "%g <-+ (%g - %g)^2\n", sum_ab2, TR_ratio, TRC_ratio);

    */

    }

        /* for Distance2() */

    if (measure==5) {

        sum_ab2 += (TR_ratio - MR_ratio)*(TR_ratio - MR_ratio);

    }

   

    /* for Entropy() */

    if (measure==3) { /* test for small so that 0*log(0) = 0 */

        if (MR_ratio > 0.00001)

        entropy_p -= MR->conseq_ratios[i] * log10(MR->conseq_ratios[i])

            / LOG10_2 ;

        if (TR_ratio > 0.00001)

        entropy_r -= TR_ratio  * log10(TR_ratio)/LOG10_2 ;

        if (TRC_ratio > 0.00001)

        entropy_l -= TRC_ratio * log10(TRC_ratio)/LOG10_2 ;

       

        /* fprintf(stderr, "_p %g , _r %g, _l %g\n",

        entropy_p, entropy_r, entropy_l); */

     }

    /* for Entropy() */

    if (measure==6) { /* test for small so that 0*log(0) = 0 */

        double norm_ratio ;

        if (MR_ratio >= 0) { /* i.e. always */

                /* normalized parent's ratio = 1/classes by definition */

        norm_ratio = 1.0/classes  ;

        entropy_p -= norm_ratio * log10(norm_ratio)/LOG10_2 ;

        }

   

        if (TR_ratio > 0.00001) {

        norm_ratio = (TR_ratio/MR_ratio) / entr_norm  ;

       

        entropy_r -= norm_ratio * log10(norm_ratio)/LOG10_2 ;

        }

     }

    /* printf("") ; */

    }

    /** Cos() based i.e. ORT */

    if ((measure==1) || (measure==4)) {

    /** calculate the cosine of the angle between the two distr. vectors */

    double cos_angle = sum_abab / (sqrt(sum_a2)*sqrt(sum_b2));

    TR->value = 1 - cos_angle ;

    if (debug) fprintf(stderr, "%g <- 1 - (%g / (sqrt(%g) * sqrt(%g)))\n",

        TR->value, sum_abab, sum_a2, sum_b2 ) ;

    }

    /** Euclidean distance() based */

    if ((measure == 2) || (measure == 5)) {

    /** calc distance between the two vectors */

    /** normalize [0,sqrt(2)] to [0,1]  */

    TR->value = sqrt(sum_ab2)/SQRT_2 ;

    if (debug) fprintf(stderr, "%f <- sqrt(%g)/%g\n",

        TR->value, sum_ab2, sqrt(2) ) ;

    }

    /** Entropy() based */

    if (measure == 3) {

    /* probability of each path */

    double p_l = (MR_cover - TR_cover)/MR_cover ;

    double p_r = (TR_cover)/MR_cover ;

   

    /** parents entropy minus the portion of the children's entropy */

    TR->value = entropy_p - p_l*entropy_l - p_r*entropy_r ;

    if (debug) fprintf(stderr, "%f <- %f - %g*%g - %g*%g\n",

        TR->value, entropy_p, p_l, entropy_l,  p_r, entropy_r ) ;

    }

    /** Entropy2() based */

    if (measure == 6) {

    /** parents entropy minus the the child's entropy */

    TR->value = entropy_p - entropy_r ;

    /* if (debug) fprintf(stderr, "%g <- %g - %g\n",

        TR->value, entropy_p, entropy_r ) ; */

    }

    if(debug) fprintf(stderr, "heuristic->%f\n", TR->value) ;

    return (0) ;

}

/*****************************************************************************

** print a report of the rule consequent

*****************************************************************************/

void    print_rule(

    FILE        *fd ,

    domain_type    DI ,   /* Domain_Info */

    rule_type    *Rule

    )

{

    struct test  TEST[MAX_ATTRIBUTES]; /* Ratios repository to be sorted */

    int          classes = DI.Datadict[DI.resp_attr].size ;

    char     Quantity[13] ;

    char      Ratio[13] ;

    int         i ;

    if (debug) fprintf(stderr,"debug: enter print_rule()\n") ;

    /* prepare the test table */

    bzero(&TEST, sizeof(TEST)) ;

    for(i=0; i<classes; i++) {

    TEST[i].index = i ;

    TEST[i].ratio = Rule->conseq_ratios[i] ;

    }

    /* qsort into descending order */

    qsort (&TEST, classes, sizeof(struct test), ratio_compare) ;

    /* print the banner */

    if (Rule->Rule != NULL)

    fprintf(fd, "BECAUSE\n%s\n", Rule->Rule );

    else

        fprintf(fd, "%s\n", NULL_RULE_DESC) ;

    fprintf(fd, "PREDICT\n");

    fprintf(fd, "%-25s\t%-13s\t%-13s\n",

        DI.Datadict[DI.resp_attr].name,

        "Quantity", "Ratio" );

    fprintf(fd, "%-25s\t%-13s\t%-13s\n", "-------------------------", "-------------", "-------------" );

    /* print out the consequent */

    for (i=0; i<classes; i++) {

        /* get the quantity and ratio for this class */

    sprintf(Quantity, "%d", /* round this number */

        (int)floor(0.5 + (Rule->cover * TEST[i].ratio) ) ) ;

    sprintf(Ratio, "%f", TEST[i].ratio ) ;

    fprintf(fd, "%-25s\t%-13s\t%-13s\n",

        DI.Datadict[DI.resp_attr].members[TEST[i].index],

        Quantity, Ratio ) ;

    }

}

/*****************************************************************************

** print the correctness of the rule consequent

** BUG: What to return when equally probable predictions arise (e.g. 50:50:0)?

*****************************************************************************/

void test_rule(

    FILE            *fd ,

    domain_type    *DI ,

    rule_type    *Rule ,

    char        *True_Response ,

    int        within  /* the placement of the correct class in */

                    /* the consequent must be within this value. */

    )

{

    struct test TEST[MAX_ATTRIBUTES]; /* Ratios repository to be sorted */

    int  found_at=MAX_ATTRIBUTES+1 ;

    int  i ;

    char result[TOKEN] ;

    int  classes = DI->Datadict[DI->resp_attr].size ;

    if (debug) fprintf(stderr,"debug: enter test_rule()\n") ;

    /* prepare the test table */

    bzero(&TEST, sizeof(TEST)) ;

    for(i=0; i<classes; i++) {

    TEST[i].index = i ;

    TEST[i].ratio = Rule->conseq_ratios[i] ;

    }

    /* qsort into descending order */

    qsort (&TEST, classes, sizeof(struct test), ratio_compare) ;

    /* process the result */

    for(i=0; i<classes; i++) {

    if (debug) fprintf(stderr,"debug: [%s]^[%s]",

        DI->Datadict[DI->resp_attr].members[TEST[i].index],

               True_Response) ;      /* BUG */

    if ( !strcmp(DI->Datadict[DI->resp_attr].members[TEST[i].index],

             True_Response) ) {

        found_at=i ;

        if (debug) fprintf(stderr, "\tcorrect match\n" ) ;

    }

    else

        if (debug) fprintf(stderr, "\tdon't match\n" ) ;

    }

    if (found_at<within) strcpy(result, CORRECT_STRING) ;

    else strcpy(result, WRONG_STRING) ;

   

    if (verbose) {   

    fprintf(fd, "\n%8s %20s %8s %20s\n",

        "Result", "Antecedent Size", "Cover", "True Response");

    fprintf(fd, "%8s %20s %8s %20s\n", "--------",

        "--------------------", "--------", "--------------------") ;

    fprintf(fd, "%8s %20d %8d %20s\n", result, Rule->a_size, Rule->cover,

        True_Response ) ;

    }

    else {

    time_2=time(NULL) ;

    fprintf(fd, "%s\t%d\t%d\t%d\n", result, Rule->a_size, Rule->cover, (int)(time_2-time_1)) ;

    time_1=time_2 ;

    }

    fflush(fd) ;

   

}

/*****************************************************************************

** used as the sorting function in qsort()

*****************************************************************************/

static int ratio_compare(

    const void *i_ ,

    const void *j_

)

{

    struct test *i=(struct test *)i_ ;

    struct test *j=(struct test *)j_ ;

    int value ;

    value = 9999999 * (j->ratio - i->ratio);

    if (debug) fprintf(stderr, "debug: %f\t%f -> %d\n",

               j->ratio, i->ratio, value) ;

    return (value);

}

static void set_rule_description (

    rule_type *Rule,

    domain_type DI

    )

{

    int i;

    char term[TOKEN];

    int first=0;

   

    for(i=0; i<DI.attributes; i++) {

    if (Rule->attribute_map[i]) {

        char *attribute=DI.Datadict[i].name ;       

       

        if (first++) /* if first then clean out o/wise slap an AND */

        strcat(Rule->Rule, " AND ") ;

        else

        strcpy(Rule->Rule, "") ;

        if (DI.Datadict[i].type == NOMINAL) {

#if DBMS==1

           /* SQL FORM */

        sprintf(term, "%s=[%s]", attribute,

            DI.Datadict[i].members[(int)Rule->Terms[i].nominal]) ;

#else

        sprintf(term, "%s=\"%s\"", attribute,

            DI.Datadict[i].members[(int)Rule->Terms[i].nominal]) ;

#endif

        }

        else {

#if DBMS==1

        sprintf(term, "%s IN [%g, %g]" ,

            attribute , Rule->Terms[i].min_max[MIN] ,

            Rule->Terms[i].min_max[MAX] ) ;

#else

        /* SQL FORM */

        sprintf(term, "%s>=%g AND %s<=%g" ,

            attribute , Rule->Terms[i].min_max[MIN] ,

            attribute , Rule->Terms[i].min_max[MAX] ) ;

#endif

        }

        strcat(Rule->Rule, term) ;

    }

    }

}