DBPredictor

http://www.cs.sfu.ca/~melli/DBPredictor/

/*********************************************************************

**********************************************************************

** DEFINED DEFAULTS

**********************************************************************

*********************************************************************/

#define MAX_ATTRIBUTES 81

#define TOKEN 256

#define STRING 2056

#define MAX_DOMAIN_SIZE 64

#define NOMINAL_STRING "nominal"

#define ORDINAL_STRING "ordinal"

#define CONTINUOUS_STRING "continuous"

#define NOMINAL 0

#define ORDINAL 1

#define CONTINUOUS 2

#define MAX_ROWS 200000 /* for in memory (signed short int) */

#define NOMINAL_NA -1

#define CONTINUOUS_NA -999 /* as per C4.5 */

#define WRONG_STRING "WRONG"

#define CORRECT_STRING "CORRECT"

#define MIN 0

#define MAX 1

/*********************************************************************

**********************************************************************

** GLOBAL STRUCTURES

**********************************************************************

*********************************************************************/

typedef union

{

char nominal ; /* id of the attribute member */

float continuous ;

} attr_val ;

typedef union

{

char nominal ; /* id of the attribute member */

float min_max[2] ;

} term ;

/* Rule structure */

typedef struct classifier_str {

term Terms[MAX_ATTRIBUTES] ;

char attribute_map[MAX_ATTRIBUTES]; /* degree of conj on this attr */

char antecedent_order[MAX_ATTRIBUTES]; /* order of specialization for this rule */

int a_size ; /* number of conjunctions (ANDs) in this rule */

float value ; /* rule quality value */

int cover ; /* rule cover BUG - change name to cover */

char Rule[STRING] ; /* text based rule (SQL WHERE clause) */

float conseq_ratios[MAX_DOMAIN_SIZE] ;

void *db_priv ; /* pointer available for the private use

of the db functions. e.g. to pass info

from create_rule to specialize_rule. */

} rule_type ;

typedef struct attribute_str

{

char name[TOKEN];

short type ;

short size ;

/* BUG: a union would be more efficient for these two */

/* BUG: domain members should be in alpha ascending order */

char members[MAX_DOMAIN_SIZE][TOKEN] ; /* for nominal */

float min ; /* for ordinal and continuous */

float max ; /* for ordinal and continuous */

} attribute_type ;

typedef struct domain_str {

short attributes ; /* number of attribute in DB_Table */

short resp_attr ;

attribute_type Datadict[MAX_ATTRIBUTES] ;

long N ; /* number of records in DB_Table */

/* char Response_Attribute[MAX_STRING] ;

char Resp_Attr_Dom[MAX_ATTRIBUTES][MAX_STRING] ;

short resp_attr_dom_size ; */

} domain_type ;

/*********************************************************************

**********************************************************************

** FORWARD DECLARATION

**********************************************************************

*********************************************************************/

extern int init_db (

void **, char *, domain_type

) ;

extern int create_rule (

void *, domain_type *, rule_type *

) ;

extern int specialize_rule (

void *, domain_type *, rule_type *, int

) ;

extern int clear_rule (

void *

) ;

extern int db_exit (

void *

) ;

/* make this flag available to the other objects */

extern int debug ;

////////////////////////////////////////////////////////////////////////////////////////////

#define AUTHOR "Gabor_Melli@cs.sfu.ca"

#define DATE "1998/03/28" /* string */

#define VERSION 2.0 /* real number */

/*****************************************************************

** Attribute-Value Prediction (DBPredictor) program **

** **

** School of Computing Science / Simon Fraser University **

** **

** This code is freely available for personal or academic use. **

** Express written consent from the author is required to use **

** this program in a commercial setting. **

** **

** Comments and suggestions are very welcome! **

** **

** for further information please refer to **

** http://www.cs.sfu.ca/~melli/DBPredictor **

** **

******************************************************************

** **

** EXIT STATUS: **

** 0: successful **

** -1: unsuccessful **

** **

** NEW **

** **

** TO DO: http://.../DBPredictor/to_do.html **

** **

** QUESTIONS: http://.../DBPredictor/questions.html **

** **

** BUGS: **

** **

*****************************************************************/

/*****************************************************************

** PROGRAM STRUCTURE **

** **

** INCLUDE FILES **

** DEFINED DEFAULTS **

** SYNTAX MESSAGE **

** GLOBAL STRUCTURES **

** GLOBAL VARIABLES **

** FORWARD DECLARATION **

** MAIN **

** - Process the Program Parameters **

** - Test the Selected Parameter Settings **

** - Report of the Variable Settings (when verbose) **

** - PEFORM PREDICTION **

** **

** SUPPORT PROCEDURES **

** init_db() **

** create_overall() **

** create_rule() **

** print_rule() **

** test_rule() **

** C_SEP_measure() **

** ratio_compare() **

** err_handler() **

** msg_handler() **

*****************************************************************/

/*****************************************************************

******************************************************************

** INCLUDE FILES **

******************************************************************

*****************************************************************/

#include <stdio.h>

#include <math.h> /* sqrt(), ceil() */

#include <stdlib.h> /* atoi() */

#include <string.h> /* bcopy(), bzero() */

#include <unistd.h> /* getopt() */

#include <time.h> /* time() */

#include "dbpredictor.h" /* typedefs and globals */

/*********************************************************************

**********************************************************************

** LOCAL DEFINITIONS

**********************************************************************

*********************************************************************/

/* determined through empirical study */

#define NUM_PART_RATIO 2.0 /* def. numerical partition ratio */

#define MIN_COVER 1 /* def. smallest rule support considered */

#define MIN_MEASURE 0.0 /* def. smallest interestingness considered */

#define DEF_EVAL_FN 5 /* def. evaluation function */

#define LOG10_2 0.301029996

#define SQRT_2 1.414213562

#define NULL_RULE_DESC "Overall Distribution"

/*********************************************************************

**********************************************************************

** GLOBAL STRUCTURES AND VARIABLES

**********************************************************************

*********************************************************************/

struct test { /* used to temporarily sort the consequent distribution */

int index ;

double ratio ;

} ;

int debug=0;

static int verbose=0;

static time_t time_1,time_2 ; /* help to report duration between preds */

/*********************************************************************

**********************************************************************

** SYNTAX MESSAGE

**********************************************************************

*********************************************************************/

#define USAGE \

fprintf(stderr, "\nSYNTAX: %s [-bdhisv] -tDFR option\n\n", argv[0]) ; \

fprintf(stderr, "\td:\tdebug msgs to stderr\n") ; \

fprintf(stderr, "\th:\thelp\n") ; \

fprintf(stderr, "\ti:\tinteractive (one at a time vs batch)\n") ; \

fprintf(stderr, "\tv:\tverbose\n") ; \

fprintf(stderr, "\n") ; \

fprintf(stderr, "\tt:\tX (report accuracy within X)\n") ; \

fprintf(stderr, "\tD:\tDatabase Table\n") ; \

fprintf(stderr, "\tf:\tFilestem\n") ; \

fprintf(stderr, "\tR:\tResponse attribute to predict\n") ; \

fprintf(stderr, "\te:\tevaluation function [%d]\n", DEF_EVAL_FN) ; \

fprintf(stderr, "\tm:\tmin heuristic measure [%g]\n", MIN_MEASURE) ; \

fprintf(stderr, "\tc:\tmin cover [%d]\n", MIN_COVER) ; \

fprintf(stderr, "\tn:\tnumerical attr div ratio [%g]\n", NUM_PART_RATIO) ; \

fprintf(stderr, "\n") ; \

fprintf(stderr, "\tExample\n") ; \

fprintf(stderr, "\t%s -f iris -Rclass -v\n",argv[0]) ; \

fprintf(stderr, "\n") ;

/*********************************************************************

**********************************************************************

** FORWARD DECLARATION

**********************************************************************

*********************************************************************/

extern void bzero(void *, int) ;

extern void bcopy (const void *, void *, int) ;

extern int getopt(int, char * const [], const char *) ;

extern char *strtok(char *, const char *) ;

static void test_rule(

FILE *, struct domain_str *, struct classifier_str *, char *, int ) ;

static void print_rule(

FILE *, struct domain_str, struct classifier_str * ) ;

static int eval_fn(

struct classifier_str *, struct classifier_str *, struct domain_str, int ) ;

/* static int ratio_compare (struct test *, struct test * ) ; */

static int ratio_compare (const void *, const void *) ;

static void set_rule_description (rule_type *, domain_type) ;

/*****************************************************************

******************************************************************

** MAIN **

******************************************************************

*****************************************************************/

int main(int argc, char *argv[])

{

FILE *datadict_file=NULL ;

FILE *test_file=NULL ;

FILE *output_file=NULL;

char Filestem[100] ;

char Datadict_filename[256] ;

char Output_filename[256] ;

char Tests_filename[256] ;

char DB_Table_Name[256] ;

char Response_Attribute[TOKEN] = {0} ;

int min_cover = MIN_COVER ;

float min_measure = MIN_MEASURE ;

float num_part_ratio = NUM_PART_RATIO ;

int eval_function = DEF_EVAL_FN ;

int i ; /* counter */

int interactive=1 ;

char datadict_string[STRING] ;

char test_string[STRING] ; /* holds event vector values */

char test_accuracy=1 ; /* default: best class accuracy */

void *DB_ptr;

int test_counter=0 ;

domain_type Domain_Info ;

rule_type Max_Rule ;

/***********************************************

** INITIALIZE

***********************************************/

DB_Table_Name[0] = (char)NULL ;

Filestem[0] = (char)NULL ;

Output_filename[0] = (char)NULL ;

Response_Attribute[0] = (char)NULL ;

/***********************************************

** DECOMPOSE THE PROGRAM ARGUMENTS

***********************************************/

{

char c ; /* parameter flag character */

extern char *optarg ;

while ((c = getopt(argc, argv, "dhisvt:f:D:R:n:c:m:e:")) != -1)

switch(c)

{

case 'd': /* Debug */

debug=1 ;

verbose=1 ;

break ;

case 'h': /* Simple help message */

USAGE ;

exit(-1) ;

case 'i': /* Interactive instead of batch */

interactive=1 ;

break ;

case 't': /* Test accuracy instead of report rules */

test_accuracy=atoi(optarg) ;

break ;

case 'v': /* Verbose */

verbose=1 ;

break ;

case 'f': /* file prefix */

strcpy(Filestem, optarg) ;

break ;

case 'D': /* Database table */

strcpy(DB_Table_Name, optarg) ;

break ;

case 'R': /* Response attribute to predict */

strcpy(Response_Attribute, optarg) ;

break ;

case 'c': /* Minimum cover parameter */

sscanf(optarg, "%d", &min_cover) ;

break ;

case 'm': /* Minimum measure */

sscanf(optarg, "%f", &min_measure) ;

break ;

case 'n': /* Response attribute to predict */

sscanf(optarg, "%f", &num_part_ratio) ;

break ;

case 'e': /* Evaluation function */

if (!strcmp(optarg,"cos"))

eval_function = 1;

else if(!strcmp(optarg,"dist"))

eval_function = 2;

else if(!strcmp(optarg,"entropy"))

eval_function = 3;

else if(!strcmp(optarg,"cos2"))

eval_function = 4;

else if(!strcmp(optarg,"dist2"))

eval_function = 5;

else if(!strcmp(optarg,"entropy2"))

eval_function = 6;

else {

fprintf(stderr,"ERROR: Invalid parameter -e %s\n", optarg);

exit(-1);

}

break ;

}

/* Report the version information in verbose mode */

if (verbose) fprintf(stdout,

"v%4.2f(%s)\n - Min Measure %g\tMin Cover:%d\tNum Prun: %g\tEval Function:%d\n\n",

VERSION, DATE, min_measure, min_cover, num_part_ratio, eval_function );

/*****************************************************/

/* TEST FOR APPROPRIATE VALUES & REQUIRED PARAMETERS */

/*****************************************************/

if (Response_Attribute == (char)NULL) {

fprintf(stderr, "ERROR: Response attribute parameter (-R) not set\n");

exit(-1);

}

if (Filestem[0] == (char)NULL) {

fprintf(stderr, "ERROR: Filestem parameter (-f) not set\n");

exit(-1);

}

/*

if (test_accuracy < 1) {

fprintf(stderr, "ERROR: test accuracy of [%d] is invalid\n",

test_accuracy );

exit(-1);

}

*/

/***********************************************

** SETUP AND TEST I/O FILES

***********************************************/

#if DBMS==1

if (debug) fprintf(stderr, "FILE-based dataset\n") ;

if (DB_Table_Name[0] == (char)NULL) {

sprintf(DB_Table_Name, "%s.data", Filestem) ;

}

#elif DBMS==2

if (debug) fprintf(stderr, "SYBASE DB-LIB based dataset\n") ;

if (DB_Table_Name[0] == (char)NULL) {

sprintf(DB_Table_Name, "%s", Filestem) ;

}

#endif

sprintf(Datadict_filename, "%s.dict", Filestem) ;

if ((datadict_file = fopen(Datadict_filename, "r")) == NULL) {

fprintf(stderr,

"ERROR: Could not open the data dictionary file: [%s]\n",

Datadict_filename);

exit(-1);

}

sprintf(Tests_filename, "%s.test", Filestem) ;

if ((test_file = fopen(Tests_filename, "r")) == NULL) {

fprintf(stderr,

"ERROR: Could not open the tests file: %s\n",

Tests_filename);

exit(-1);

}

if (interactive) { /* interactive sessions pipe the results to stdout */

strcpy(Output_filename, "<stdout>") ;

output_file = stdout ;

}

else {

sprintf(Output_filename, "%s.output", Filestem) ;

if ((output_file = fopen(Output_filename, "w")) == NULL) {

fprintf(stderr, "ERROR: Could not open output file: %s\n",

Output_filename);

exit(-1);

}

/***********************************************

** DEBUG

***********************************************/

if (debug) {

fprintf(stderr, "debug: \n") ;

fprintf(stderr, "debug: This is a selection query\n") ;

if (interactive) fprintf(stderr, "debug: Interactive session\n") ;

else fprintf(stderr, "debug: Batch session\n") ;

if (test_accuracy) fprintf(stderr, "debug: Test accuracy query\n") ;

else fprintf(stderr, "debug: Report rule query\n") ;

fprintf(stderr, "debug: DB_Table_Name = [%s]\n", DB_Table_Name) ;

fprintf(stderr, "debug: Response Attrribute = [%s]\n",

Response_Attribute) ;

fprintf(stderr, "debug: Data Dict. file = [%s]\n",

Datadict_filename) ;

fprintf(stderr, "debug: Tests file = [%s]\n",

Tests_filename) ;

fprintf(stderr, "debug: Output file = [%s]\n",

Output_filename) ;

fprintf(stderr, "debug: \n") ;

}

/***********************************************

** READ IN THE DATA DICTIONARY INFORMATION

***********************************************/

Domain_Info.attributes=0 ;

fgets(datadict_string, STRING, datadict_file) ;

while (!feof(datadict_file)) {

char *token ;

char attr_desc[MAX_ATTRIBUTES][TOKEN] ;

float min ;

float max ;

int tokens, i ;

/* prune the ".\n" ending from the buffer if it exists */

if ((token=strstr(datadict_string,".\n")) != NULL)

*token = (char)NULL ;

if ((token=strstr(datadict_string,"\n")) != NULL)

*token = (char)NULL ;

/* if (debug)fprintf(stderr, "debug: datadict_string %s\n", datadict_string); */

/* Place this attributes tokens into an array */

tokens=0 ;

token = strtok(datadict_string, ":") ;

strcpy(attr_desc[tokens], token) ;

for (token=strtok(NULL, ","), i=1 ;

token != NULL ;

token=strtok(NULL, ","), i++ ) {

/* get rid of any initial spaces */

while(*token==' ' || *token=='\t') token++ ;

strcpy(attr_desc[i], token) ;

if(debug)fprintf(stderr, " [%s]\n", attr_desc[i]);

}

tokens=i ;

if (tokens<3) {

fprintf(stderr, "ERROR in %s at line %d: Information missing.\n",

Datadict_filename, Domain_Info.attributes+1) ;

exit(-1);

}

strcpy(Domain_Info.Datadict[Domain_Info.attributes].name,

attr_desc[0]) ;

if (!strcmp(NOMINAL_STRING, attr_desc[1] ) )

Domain_Info.Datadict[Domain_Info.attributes].type = NOMINAL ;

else if (!strcmp(ORDINAL_STRING, attr_desc[1] ))

Domain_Info.Datadict[Domain_Info.attributes].type = ORDINAL ;

else if (!strcmp(CONTINUOUS_STRING, attr_desc[1] ) )

Domain_Info.Datadict[Domain_Info.attributes].type = CONTINUOUS ;

else {

fprintf(stderr, "ERROR in %s: invalid domain type %s.\n",

Datadict_filename, attr_desc[1]) ;

exit(-1);

}

/* Determine this attributes domain membership */

if (Domain_Info.Datadict[Domain_Info.attributes].type != NOMINAL) {

if (sscanf(attr_desc[2], "%f", &min) != 1) {

fprintf(stderr, "ERROR in %s: invalid min value [%s].\n",

Datadict_filename, attr_desc[2] ) ;

exit(-1) ;

}

else

Domain_Info.Datadict[Domain_Info.attributes].min = min ;

if (sscanf(attr_desc[3], "%f", &max) != 1) {

fprintf(stderr, "ERROR in %s: invalid max value [%s].\n",

Datadict_filename, attr_desc[2]) ;

exit(-1) ;

}

else

Domain_Info.Datadict[Domain_Info.attributes].max=max ;

/* if (debug) fprintf(stderr, "debug: min[%f], max[%f]\n",

Domain_Info.Datadict[Domain_Info.attributes].min,

Domain_Info.Datadict[Domain_Info.attributes].max); */

}

else {

for (i=2 ; i<tokens ; i++) {

strcpy(Domain_Info.Datadict[Domain_Info.attributes].members[i-2],

attr_desc[i]) ;

if (debug) fprintf(stderr, "[%s] ",

Domain_Info.Datadict[Domain_Info.attributes].members[i-2]) ;

}

Domain_Info.Datadict[Domain_Info.attributes].size = i-2 ;

}

if (debug) fprintf(stderr, "\ndebug: Datadict entry[%d] type[%d] size[%d] name[%s]\n",

Domain_Info.attributes,

Domain_Info.Datadict[Domain_Info.attributes].type,

Domain_Info.Datadict[Domain_Info.attributes].size,

Domain_Info.Datadict[Domain_Info.attributes].name

);

/* Proceed to the next data dictionary item */

fgets(datadict_string, STRING, datadict_file) ;

Domain_Info.attributes++ ;

if (Domain_Info.attributes > MAX_ATTRIBUTES) {

fprintf(stderr,

"ERROR: number of attributes is greater than MAX [%d]\n",

MAX_ATTRIBUTES);

exit(1) ;

}

/*******************************************************/

/* LOCATE THE RESPONSE VARIABLE IN THE DATA DICTIONARY */

/*******************************************************/

Domain_Info.resp_attr = -1 ; /* default is out of bounds */

for (i=0; i<Domain_Info.attributes; i++) {

/* if (debug) fprintf(stderr, "[%s]^[%s]\n", Response_Attribute,

Domain_Info.Datadict[i].name) ; */

if(!strcmp(Response_Attribute,

Domain_Info.Datadict[i].name))

Domain_Info.resp_attr=i ;

}

if (Domain_Info.resp_attr == -1) {

fprintf(stderr, "ERROR: The response attribute [%s] ",

Response_Attribute);

fprintf(stderr, "was not found in the data dictionary\n");

exit(-1);

}

else if (Domain_Info.Datadict[Domain_Info.resp_attr].type != NOMINAL) {

fprintf(stderr, "ERROR: The response attribute [%s] ",

Response_Attribute);

fprintf(stderr, "must be of type [%s]\n", NOMINAL_STRING);

exit(-1);

}

else if(debug) {

fprintf(stderr,

"debug: response attr found. Datadict[%d] = [%s]\n",

Domain_Info.resp_attr,

Domain_Info.Datadict[Domain_Info.resp_attr].name) ;

}

/***********************************************

** INITIALIZE THE DATABASE COMMUNICATION CHANNEL

***********************************************/

if (!init_db(&DB_ptr, DB_Table_Name, Domain_Info)) {

fprintf(stderr, "ERROR: Unable to initialize the database\n");

exit(-1);

}

if (debug) fprintf(stderr, "debug: Domain=[%d]\n",

Domain_Info.Datadict[Domain_Info.resp_attr].size) ;

if (Domain_Info.Datadict[Domain_Info.resp_attr].size > MAX_DOMAIN_SIZE) {

fprintf(stderr,

"ERROR: The number of distinct values in the response attribute %s",

Response_Attribute) ;

fprintf(stderr,

"\t[%d] is greater than the current maximum allowed [%d].\n",

Domain_Info.Datadict[Domain_Info.resp_attr].size,

MAX_ATTRIBUTES) ;

exit(-1) ;

}

/***********************************************

** BEGIN PROCESSING EVENT VECTORS

***********************************************/

time_1=time(NULL) ;

fgets(test_string, STRING, test_file);

while (!feof(test_file)) {

attr_val event_vector[MAX_ATTRIBUTES];

char *token, *str_ptr ;

if ((str_ptr=strstr(test_string,".\n"))!=NULL) *str_ptr = (char)NULL ;

if ((str_ptr=strrchr(test_string,'\n'))!=NULL) *str_ptr = (char)NULL ;

if (verbose) {

fprintf(output_file,

"\n***********************************************************\n");

fprintf(output_file, "Make a prediction for test event #%d\n\n",

++test_counter);

}

if(debug) fprintf(stderr, "debug: process next test event [%s]\ndebug: ",

test_string) ;

for (token = strtok(test_string,","), i=0;

token != NULL ;

token = strtok(NULL, ","), i++ )

{

if (Domain_Info.Datadict[i].type == NOMINAL) {

int member, j;

/* get rid of any initial spaces */

while(*token==' ') token++ ;

if (!strcmp(token, "?"))

member=NOMINAL_NA ;

else {

/* BUG cannot handle any padded whitespaces */

for (j=0, member=NOMINAL_NA-1;

j<Domain_Info.Datadict[i].size;

j++)

if (!strcmp(token, Domain_Info.Datadict[i].members[j]))

member=j ;

if (member==NOMINAL_NA-1) {

fprintf(stderr, "ERROR in test file: [%s] ", token) ;

fprintf(stderr, "is not a member of attr [%s]\n",

Domain_Info.Datadict[i].name );

exit(-1);

}

event_vector[i].nominal = member ;

if (debug) fprintf(stderr, "e_v[%d][%s]->%d ",

i, token, member) ;

}

else {

float value ;

/* get rid of any initial spaces */

while(*token==' ') token++ ;

if (!strcmp(token, "?"))

value=CONTINUOUS_NA ;

else {

if (sscanf(token, "%f", &value) != 1) {

fprintf(stderr, "ERROR in test entry [%s]\n", test_string) ;

fprintf(stderr, "\t[%s] for attr [%s] is not numerical.\n",

token, Domain_Info.Datadict[i].name );

exit(-1);

}

if (value>Domain_Info.Datadict[i].max) {

fprintf(stderr, "ERROR in test entry %s:\n",test_string) ;

fprintf(stderr,

"\t%f for attr %sis larger than the domain max %f.\n",

value, Domain_Info.Datadict[i].name,

Domain_Info.Datadict[i].max);

exit(-1);

}

else if (value<Domain_Info.Datadict[i].min) {

fprintf(stderr, "ERROR in test entry %s:\n",test_string) ;

fprintf(stderr,

"\t%f for attr %sis smaller than the domain min %f.\n",

value, Domain_Info.Datadict[i].name,

Domain_Info.Datadict[i].min);

exit(-1);

}

if (debug) fprintf(stderr, "e_v[%d][%s]->%f ",

i, token, value) ;

event_vector[i].continuous = value ;

}

} /* end of foreach attribute-value */

/* complete the debug line */

if (debug) fprintf(stderr, "\n") ;

/*******************************************************

** DISCOVER THE PREDICTED RULE FOR THIS EVENT VECTOR **

*******************************************************/

{

int i ;

rule_type Null_Rule ; /* The prediction if no info given */

rule_type Max_Test_Rule ;

rule_type Test_Rule ;

/*****************************************

** DO A TOP DOWN SEARCH UNTIL LOCAL MAXIMA

*****************************************/

if(debug) fprintf(stderr,

"debug: PREDICT (Top Down Search Until Local Maxima)\n") ;

/* Get and keep the overall distribution for future usage*/

bzero(&Null_Rule, sizeof(rule_type)) ;

clear_rule(Null_Rule.db_priv) ;

create_rule(DB_ptr, &Domain_Info, &Null_Rule);

/* clear_rule(Null_Rule.db_priv) ;*/

Domain_Info.N = Null_Rule.cover ;

/* Start the Max_Rule out as the Null_Rule */

bcopy(&Null_Rule, &Max_Rule, sizeof(struct classifier_str) ) ;

/* while specializations are possible */

bzero(&Max_Test_Rule, sizeof(rule_type)) ;

Max_Test_Rule.value = 1;

while (Max_Test_Rule.value > min_measure ) {

if(debug)fprintf(stderr,"debug: perform a round of specialization\n");

Max_Test_Rule.value = 0 ;

/* try to specialize on every given attribute-value */

/* BUG: this should be a random selection so that ties */

/* do not necessarily go the the first attribute */

for (i = 0; i < Domain_Info.attributes; i++) {

/* is this a candidate term for conjunction? */

if ( i != Domain_Info.resp_attr

&& event_vector[i].nominal != NOMINAL_NA

&& event_vector[i].continuous != CONTINUOUS_NA ) {

bcopy(&Max_Rule, &Test_Rule, sizeof(rule_type)) ;

/* update the size of the antecedent if prev unused */

if (!Test_Rule.attribute_map[i]) Test_Rule.a_size++ ;

if (Domain_Info.Datadict[i].type == NOMINAL ) {

Test_Rule.Terms[i].nominal = event_vector[i].nominal ;

/* if (debug) fprintf(stderr, "assigning %d to term %d\n",

t_member, i); */

}

else {

float min, max, rmin, rmax ;

float delta ;

/* if this is the first specialization on this

non-nominal attribute then use the domain bounds */

if (!Test_Rule.attribute_map[i])

delta = Domain_Info.Datadict[i].max -

Domain_Info.Datadict[i].min ;

else

delta = Test_Rule.Terms[i].min_max[1] -

Test_Rule.Terms[i].min_max[0] ;

/* prune the range accordingly*/

delta /= num_part_ratio ;

delta /= 2.0 ; /* half on each side */

min = rmin = event_vector[i].continuous - delta ;

max = rmax = event_vector[i].continuous + delta ;

/* Round off ordinal types */

if (Domain_Info.Datadict[i].type == ORDINAL ) {

min = (float)floor(rmin+0.49999) ;

max = (float)floor(rmax+0.49999) ;

}

Test_Rule.Terms[i].min_max[MIN]=min ;

Test_Rule.Terms[i].min_max[MAX]=max ;

} /* end of term composition */

/* signal that this attribute has been used */

Test_Rule.attribute_map[i]++ ;

/* update the text Rule */

if (verbose) set_rule_description(&Test_Rule, Domain_Info) ;

/*****************************************************/

/* call SPECIALIZE_RULE() */

/* get the rule's cover and consequent from the data */

/* create_rule(DB_ptr, &Domain_Info, &Test_Rule); */

specialize_rule(DB_ptr, &Domain_Info, &Test_Rule, i);

/*****************************************************/

/* call EVALUATION_FUNCTION() */

/* get the quality of this rule */

if (Test_Rule.cover >= min_cover) {

eval_fn(&Test_Rule, &Max_Rule, Domain_Info,

eval_function) ;

}

else {

Test_Rule.value = 0 ;

}

/* is this test rule better than the current best test */

if (Max_Test_Rule.value < Test_Rule.value) {

if (debug) fprintf(stderr,

"NEW WINS *** %f < %f\tcover %d\n",

Max_Test_Rule.value, Test_Rule.value,

Test_Rule.cover);

clear_rule(Max_Test_Rule.db_priv) ;

bcopy(&Test_Rule, &Max_Test_Rule,

sizeof(rule_type) ) ;

}

else {

clear_rule(Test_Rule.db_priv) ;

if (debug) fprintf(stderr,

"old max_test stays %f < %f\tcover %d\n",

Max_Test_Rule.value, Test_Rule.value,

Test_Rule.cover);

}

else { /* may want to find out why this term was rejected */

if (debug) fprintf(stderr,

"debug: addition of term [%d] to rule [%s] FAILED.\n",

i, Max_Rule.Rule) ;

}

} /* go back and try any other potential term to specialize on */

if(debug)fprintf(stderr,"debug: end round of specialization\n");

/* Have we really specialized any further? */

if (Max_Test_Rule.value > min_measure) {

clear_rule(Max_Rule.db_priv);

/* update the max rule to be the winning test rule*/

bcopy(&Max_Test_Rule, &Max_Rule, sizeof(rule_type) ) ;

bzero(&Max_Test_Rule, sizeof(rule_type)) ;

Max_Test_Rule.value = 1;

if (debug) {

fprintf(stderr,

"NEW BEST RULE: COVER: %d\tValue %f\t\n",

Max_Rule.cover, Max_Rule.value) ;

print_rule(output_file, Domain_Info, &Max_Rule);

test_rule(output_file, &Domain_Info, &Max_Rule,

Domain_Info.Datadict[Domain_Info.resp_attr].members[(int)event_vector[Domain_Info.resp_attr].nominal],

test_accuracy);

}

} /* while specializations are possible) */

if(debug)fprintf(stderr,"debug: Max Rule found - cover %d !> %d\n",

Max_Rule.cover, Max_Test_Rule.cover) ;

} /* end of predictions section */

/***********************************************************/

/** PROCESS THE PREDICTED (OR SELECTED) RULE AS REQUESTED **/

/***********************************************************/

if(verbose)

print_rule(output_file, Domain_Info, &Max_Rule);

/* Report whether this prediction was correct to the appropriate degree

given that the correct value has been provided in the event vector */

if(test_accuracy && event_vector[Domain_Info.resp_attr].nominal!=NOMINAL_NA ) {

int resp_attr=Domain_Info.resp_attr ;

int member ;

member = event_vector[resp_attr].nominal ;

if (debug) fprintf(stderr, "%d\t%d\t[%s]\n", resp_attr, member,

Domain_Info.Datadict[resp_attr].members[member]) ;

test_rule(output_file, &Domain_Info, &Max_Rule,

Domain_Info.Datadict[resp_attr].members[member],

test_accuracy);

}

/* release this rule */

clear_rule(Max_Rule.db_priv) ;

/* prepare for the next test event */

fgets(test_string, STRING, test_file);

} /* while there are more test events*/

/***********************************************/

/** FINISH **/

/***********************************************/

if (debug) fprintf(stderr, "debug: about to FINISH\n") ;

db_exit(DB_ptr) ;

if (output_file != stdout) fclose(output_file) ;

exit(0) ;

} /* EXIT main() */

/*****************************************************************

******************************************************************

** SUPPORT FUNCTIONS **

******************************************************************

*****************************************************************/

/*****************************************************************************

** Class Separation Distance Measure

** ORT measure proposed in Fayyad,Irani[AAAI-92]

** This is the cosine of the angle between the two distributions

*****************************************************************************/

int eval_fn(

rule_type *TR , /* Test Rule */

rule_type *MR , /* Max Rule */

domain_type DI , /* Domain Info */

int measure

)

{

int i ;

int classes = DI.Datadict[DI.resp_attr].size;

double TR_cover=TR->cover ;

double MR_cover=MR->cover ;

double TRC_cover ;

double TRC_ratio=0 ;

double TR_ratio ; /* store into variable for efficiency */

double MR_ratio ; /* store into variable for efficiency */

double sum_abab = 0 ; /* Sum along dimensions of squares of (TR*TRC)'s */

double sum_ab2 = 0 ; /* Sum along dimensions of squares of (TR-TRC)'s */

double sum_a2 = 0 ; /* Sum along dimensions of squares of TR's */

double sum_b2 = 0 ; /* Sum along dimensions of squares of TRC's */

/* for entropy calc */

/* double log10_2 = log10(2) ; hardcode to calc base2 logs */

double entropy_p = 0 ;

double entropy_l = 0 ;

double entropy_r = 0 ;

double entr_norm = 0 ; /* entropy normalization quotient */

/* does this rule (or its complement) have a cover? */

if ( (TR_cover==0) || (MR_cover-TR_cover==0) ) {

TR->value = (float)0 ;

if (debug) fprintf(stderr, "debug: Everything matched!\n") ;

return(0) ;

}

/* get the normalization quotient for this entropy measure version */

if (measure==6) {

/* foreach class ratio */

for (i=0; i<classes; i++) {

/* if the ratio is zero then add zero (nothing) */

if (TR->conseq_ratios[i] > 0.00001)

entr_norm += TR->conseq_ratios[i]/MR->conseq_ratios[i] ;

}

/* foreach class value accumulate the differences */

for (i=0; i<classes; i++) {

TR_ratio = TR->conseq_ratios[i] ; /* make local for efficiency */

MR_ratio = MR->conseq_ratios[i] ; /* make local for efficiency */

/* Set the TRC_ratio based on the parents and sister's distributions*/

TRC_cover = MR_ratio*MR_cover - TR_ratio*TR_cover ;

TRC_ratio = TRC_cover / (MR_cover - TR_cover) ;

if (debug) fprintf(stderr, "%f %f %f %f %f %f\n",

MR->conseq_ratios[i], MR->conseq_ratios[i]*MR->cover,

TR_ratio, TR_ratio*TR_cover,

TRC_ratio, TRC_cover ) ;

/* for Cos() */

if (measure==1) {

sum_abab += ( TR_ratio * TRC_ratio ) ;

sum_a2 += ( TR_ratio * TR_ratio ) ;

sum_b2 += ( TRC_ratio * TRC_ratio ) ;

}

/* for Cos2() */

if (measure==4) {

sum_abab += ( TR_ratio * MR_ratio ) ;

sum_a2 += ( TR_ratio * TR_ratio ) ;

sum_b2 += ( MR_ratio * MR_ratio ) ;

}

/* for Distance() */

if (measure==2) {

sum_ab2 += (TR_ratio - TRC_ratio)*(TR_ratio - TRC_ratio);

/* fprintf(stderr,

"%g <-+ (%g - %g)^2\n", sum_ab2, TR_ratio, TRC_ratio);

*/

}

/* for Distance2() */

if (measure==5) {

sum_ab2 += (TR_ratio - MR_ratio)*(TR_ratio - MR_ratio);

}

/* for Entropy() */

if (measure==3) { /* test for small so that 0*log(0) = 0 */

if (MR_ratio > 0.00001)

entropy_p -= MR->conseq_ratios[i] * log10(MR->conseq_ratios[i])

/ LOG10_2 ;

if (TR_ratio > 0.00001)

entropy_r -= TR_ratio * log10(TR_ratio)/LOG10_2 ;

if (TRC_ratio > 0.00001)

entropy_l -= TRC_ratio * log10(TRC_ratio)/LOG10_2 ;

/* fprintf(stderr, "_p %g , _r %g, _l %g\n",

entropy_p, entropy_r, entropy_l); */

}

/* for Entropy() */

if (measure==6) { /* test for small so that 0*log(0) = 0 */

double norm_ratio ;

if (MR_ratio >= 0) { /* i.e. always */

/* normalized parent's ratio = 1/classes by definition */

norm_ratio = 1.0/classes ;

entropy_p -= norm_ratio * log10(norm_ratio)/LOG10_2 ;

}

if (TR_ratio > 0.00001) {

norm_ratio = (TR_ratio/MR_ratio) / entr_norm ;

entropy_r -= norm_ratio * log10(norm_ratio)/LOG10_2 ;

}

/* printf("") ; */

}

/** Cos() based i.e. ORT */

if ((measure==1) || (measure==4)) {

/** calculate the cosine of the angle between the two distr. vectors */

double cos_angle = sum_abab / (sqrt(sum_a2)*sqrt(sum_b2));

TR->value = 1 - cos_angle ;

if (debug) fprintf(stderr, "%g <- 1 - (%g / (sqrt(%g) * sqrt(%g)))\n",

TR->value, sum_abab, sum_a2, sum_b2 ) ;

}

/** Euclidean distance() based */

if ((measure == 2) || (measure == 5)) {

/** calc distance between the two vectors */

/** normalize [0,sqrt(2)] to [0,1] */

TR->value = sqrt(sum_ab2)/SQRT_2 ;

if (debug) fprintf(stderr, "%f <- sqrt(%g)/%g\n",

TR->value, sum_ab2, sqrt(2) ) ;

}

/** Entropy() based */

if (measure == 3) {

/* probability of each path */

double p_l = (MR_cover - TR_cover)/MR_cover ;

double p_r = (TR_cover)/MR_cover ;

/** parents entropy minus the portion of the children's entropy */

TR->value = entropy_p - p_l*entropy_l - p_r*entropy_r ;

if (debug) fprintf(stderr, "%f <- %f - %g*%g - %g*%g\n",

TR->value, entropy_p, p_l, entropy_l, p_r, entropy_r ) ;

}

/** Entropy2() based */

if (measure == 6) {

/** parents entropy minus the the child's entropy */

TR->value = entropy_p - entropy_r ;

/* if (debug) fprintf(stderr, "%g <- %g - %g\n",

TR->value, entropy_p, entropy_r ) ; */

}

if(debug) fprintf(stderr, "heuristic->%f\n", TR->value) ;

return (0) ;

}

/*****************************************************************************

** print a report of the rule consequent

*****************************************************************************/

void print_rule(

FILE *fd ,

domain_type DI , /* Domain_Info */

rule_type *Rule

)

{

struct test TEST[MAX_ATTRIBUTES]; /* Ratios repository to be sorted */

int classes = DI.Datadict[DI.resp_attr].size ;

char Quantity[13] ;

char Ratio[13] ;

int i ;

if (debug) fprintf(stderr,"debug: enter print_rule()\n") ;

/* prepare the test table */

bzero(&TEST, sizeof(TEST)) ;

for(i=0; i<classes; i++) {

TEST[i].index = i ;

TEST[i].ratio = Rule->conseq_ratios[i] ;

}

/* qsort into descending order */

qsort (&TEST, classes, sizeof(struct test), ratio_compare) ;

/* print the banner */

if (Rule->Rule != NULL)

fprintf(fd, "BECAUSE\n%s\n", Rule->Rule );

else

fprintf(fd, "%s\n", NULL_RULE_DESC) ;

fprintf(fd, "PREDICT\n");

fprintf(fd, "%-25s\t%-13s\t%-13s\n",

DI.Datadict[DI.resp_attr].name,

"Quantity", "Ratio" );

fprintf(fd, "%-25s\t%-13s\t%-13s\n", "-------------------------", "-------------", "-------------" );

/* print out the consequent */

for (i=0; i<classes; i++) {

/* get the quantity and ratio for this class */

sprintf(Quantity, "%d", /* round this number */

(int)floor(0.5 + (Rule->cover * TEST[i].ratio) ) ) ;

sprintf(Ratio, "%f", TEST[i].ratio ) ;

fprintf(fd, "%-25s\t%-13s\t%-13s\n",

DI.Datadict[DI.resp_attr].members[TEST[i].index],

Quantity, Ratio ) ;

}

/*****************************************************************************

** print the correctness of the rule consequent

** BUG: What to return when equally probable predictions arise (e.g. 50:50:0)?

*****************************************************************************/

void test_rule(

FILE *fd ,

domain_type *DI ,

rule_type *Rule ,

char *True_Response ,

int within /* the placement of the correct class in */

/* the consequent must be within this value. */

)

{

struct test TEST[MAX_ATTRIBUTES]; /* Ratios repository to be sorted */

int found_at=MAX_ATTRIBUTES+1 ;

int i ;

char result[TOKEN] ;

int classes = DI->Datadict[DI->resp_attr].size ;

if (debug) fprintf(stderr,"debug: enter test_rule()\n") ;

/* prepare the test table */

bzero(&TEST, sizeof(TEST)) ;

for(i=0; i<classes; i++) {

TEST[i].index = i ;

TEST[i].ratio = Rule->conseq_ratios[i] ;

}

/* qsort into descending order */

qsort (&TEST, classes, sizeof(struct test), ratio_compare) ;

/* process the result */

for(i=0; i<classes; i++) {

if (debug) fprintf(stderr,"debug: [%s]^[%s]",

DI->Datadict[DI->resp_attr].members[TEST[i].index],

True_Response) ; /* BUG */

if ( !strcmp(DI->Datadict[DI->resp_attr].members[TEST[i].index],

True_Response) ) {

found_at=i ;

if (debug) fprintf(stderr, "\tcorrect match\n" ) ;

}

else

if (debug) fprintf(stderr, "\tdon't match\n" ) ;

}

if (found_at<within) strcpy(result, CORRECT_STRING) ;

else strcpy(result, WRONG_STRING) ;

if (verbose) {

fprintf(fd, "\n%8s %20s %8s %20s\n",

"Result", "Antecedent Size", "Cover", "True Response");

fprintf(fd, "%8s %20s %8s %20s\n", "--------",

"--------------------", "--------", "--------------------") ;

fprintf(fd, "%8s %20d %8d %20s\n", result, Rule->a_size, Rule->cover,

True_Response ) ;

}

else {

time_2=time(NULL) ;

fprintf(fd, "%s\t%d\t%d\t%d\n", result, Rule->a_size, Rule->cover, (int)(time_2-time_1)) ;

time_1=time_2 ;

}

fflush(fd) ;

}

/*****************************************************************************

** used as the sorting function in qsort()

*****************************************************************************/

static int ratio_compare(

const void *i_ ,

const void *j_

)

{

struct test *i=(struct test *)i_ ;

struct test *j=(struct test *)j_ ;

int value ;

value = 9999999 * (j->ratio - i->ratio);

if (debug) fprintf(stderr, "debug: %f\t%f -> %d\n",

j->ratio, i->ratio, value) ;

return (value);

}

static void set_rule_description (

rule_type *Rule,

domain_type DI

)

{

int i;

char term[TOKEN];

int first=0;

for(i=0; i<DI.attributes; i++) {

if (Rule->attribute_map[i]) {

char *attribute=DI.Datadict[i].name ;

if (first++) /* if first then clean out o/wise slap an AND */

strcat(Rule->Rule, " AND ") ;

else

strcpy(Rule->Rule, "") ;

if (DI.Datadict[i].type == NOMINAL) {

#if DBMS==1

/* SQL FORM */

sprintf(term, "%s=[%s]", attribute,

DI.Datadict[i].members[(int)Rule->Terms[i].nominal]) ;

#else

sprintf(term, "%s=\"%s\"", attribute,

DI.Datadict[i].members[(int)Rule->Terms[i].nominal]) ;

#endif

}

else {

#if DBMS==1

sprintf(term, "%s IN [%g, %g]" ,

attribute , Rule->Terms[i].min_max[MIN] ,

Rule->Terms[i].min_max[MAX] ) ;

#else

/* SQL FORM */

sprintf(term, "%s>=%g AND %s<=%g" ,

attribute , Rule->Terms[i].min_max[MIN] ,

attribute , Rule->Terms[i].min_max[MAX] ) ;

#endif

}

strcat(Rule->Rule, term) ;

}