DBPredictor
http://www.cs.sfu.ca/~melli/DBPredictor/
/*********************************************************************
**********************************************************************
** DEFINED DEFAULTS
**********************************************************************
*********************************************************************/
#define MAX_ATTRIBUTES 81
#define TOKEN 256
#define STRING 2056
#define MAX_DOMAIN_SIZE 64
#define NOMINAL_STRING "nominal"
#define ORDINAL_STRING "ordinal"
#define CONTINUOUS_STRING "continuous"
#define NOMINAL 0
#define ORDINAL 1
#define CONTINUOUS 2
#define MAX_ROWS 200000 /* for in memory (signed short int) */
#define NOMINAL_NA -1
#define CONTINUOUS_NA -999 /* as per C4.5 */
#define WRONG_STRING "WRONG"
#define CORRECT_STRING "CORRECT"
#define MIN 0
#define MAX 1
/*********************************************************************
**********************************************************************
** GLOBAL STRUCTURES
**********************************************************************
*********************************************************************/
typedef union
{
char nominal ; /* id of the attribute member */
float continuous ;
} attr_val ;
typedef union
{
char nominal ; /* id of the attribute member */
float min_max[2] ;
} term ;
/* Rule structure */
typedef struct classifier_str {
term Terms[MAX_ATTRIBUTES] ;
char attribute_map[MAX_ATTRIBUTES]; /* degree of conj on this attr */
char antecedent_order[MAX_ATTRIBUTES]; /* order of specialization for this rule */
int a_size ; /* number of conjunctions (ANDs) in this rule */
float value ; /* rule quality value */
int cover ; /* rule cover BUG - change name to cover */
char Rule[STRING] ; /* text based rule (SQL WHERE clause) */
float conseq_ratios[MAX_DOMAIN_SIZE] ;
void *db_priv ; /* pointer available for the private use
of the db functions. e.g. to pass info
from create_rule to specialize_rule. */
} rule_type ;
typedef struct attribute_str
{
char name[TOKEN];
short type ;
short size ;
/* BUG: a union would be more efficient for these two */
/* BUG: domain members should be in alpha ascending order */
char members[MAX_DOMAIN_SIZE][TOKEN] ; /* for nominal */
float min ; /* for ordinal and continuous */
float max ; /* for ordinal and continuous */
} attribute_type ;
typedef struct domain_str {
short attributes ; /* number of attribute in DB_Table */
short resp_attr ;
attribute_type Datadict[MAX_ATTRIBUTES] ;
long N ; /* number of records in DB_Table */
/* char Response_Attribute[MAX_STRING] ;
char Resp_Attr_Dom[MAX_ATTRIBUTES][MAX_STRING] ;
short resp_attr_dom_size ; */
} domain_type ;
/*********************************************************************
**********************************************************************
** FORWARD DECLARATION
**********************************************************************
*********************************************************************/
extern int init_db (
void **, char *, domain_type
) ;
extern int create_rule (
void *, domain_type *, rule_type *
) ;
extern int specialize_rule (
void *, domain_type *, rule_type *, int
) ;
extern int clear_rule (
void *
) ;
extern int db_exit (
void *
) ;
/* make this flag available to the other objects */
extern int debug ;
////////////////////////////////////////////////////////////////////////////////////////////
#define AUTHOR "Gabor_Melli@cs.sfu.ca"
#define DATE "1998/03/28" /* string */
#define VERSION 2.0 /* real number */
/*****************************************************************
** Attribute-Value Prediction (DBPredictor) program **
** **
** (c) 1998 Gabor Melli **
** School of Computing Science / Simon Fraser University **
** **
** This code is freely available for personal or academic use. **
** Express written consent from the author is required to use **
** this program in a commercial setting. **
** **
** Comments and suggestions are very welcome! **
** **
** for further information please refer to **
** http://www.cs.sfu.ca/~melli/DBPredictor **
** **
******************************************************************
** **
** **
** EXIT STATUS: **
** 0: successful **
** -1: unsuccessful **
** **
** NEW **
** **
** TO DO: http://.../DBPredictor/to_do.html **
** **
** QUESTIONS: http://.../DBPredictor/questions.html **
** **
** BUGS: **
** **
*****************************************************************/
/*****************************************************************
** PROGRAM STRUCTURE **
** **
** INCLUDE FILES **
** DEFINED DEFAULTS **
** SYNTAX MESSAGE **
** GLOBAL STRUCTURES **
** GLOBAL VARIABLES **
** FORWARD DECLARATION **
** MAIN **
** - Process the Program Parameters **
** - Test the Selected Parameter Settings **
** - Report of the Variable Settings (when verbose) **
** - PEFORM PREDICTION **
** **
** SUPPORT PROCEDURES **
** init_db() **
** create_overall() **
** create_rule() **
** print_rule() **
** test_rule() **
** C_SEP_measure() **
** ratio_compare() **
** err_handler() **
** msg_handler() **
*****************************************************************/
/*****************************************************************
******************************************************************
** INCLUDE FILES **
******************************************************************
*****************************************************************/
#include <stdio.h>
#include <math.h> /* sqrt(), ceil() */
#include <stdlib.h> /* atoi() */
#include <string.h> /* bcopy(), bzero() */
#include <unistd.h> /* getopt() */
#include <time.h> /* time() */
#include "dbpredictor.h" /* typedefs and globals */
/*********************************************************************
**********************************************************************
** LOCAL DEFINITIONS
**********************************************************************
*********************************************************************/
/* determined through empirical study */
#define NUM_PART_RATIO 2.0 /* def. numerical partition ratio */
#define MIN_COVER 1 /* def. smallest rule support considered */
#define MIN_MEASURE 0.0 /* def. smallest interestingness considered */
#define DEF_EVAL_FN 5 /* def. evaluation function */
#define LOG10_2 0.301029996
#define SQRT_2 1.414213562
#define NULL_RULE_DESC "Overall Distribution"
/*********************************************************************
**********************************************************************
** GLOBAL STRUCTURES AND VARIABLES
**********************************************************************
*********************************************************************/
struct test { /* used to temporarily sort the consequent distribution */
int index ;
double ratio ;
} ;
int debug=0;
static int verbose=0;
static time_t time_1,time_2 ; /* help to report duration between preds */
/*********************************************************************
**********************************************************************
** SYNTAX MESSAGE
**********************************************************************
*********************************************************************/
#define USAGE \
fprintf(stderr, "\nSYNTAX: %s [-bdhisv] -tDFR option\n\n", argv[0]) ; \
fprintf(stderr, "\td:\tdebug msgs to stderr\n") ; \
fprintf(stderr, "\th:\thelp\n") ; \
fprintf(stderr, "\ti:\tinteractive (one at a time vs batch)\n") ; \
fprintf(stderr, "\tv:\tverbose\n") ; \
fprintf(stderr, "\n") ; \
fprintf(stderr, "\tt:\tX (report accuracy within X)\n") ; \
fprintf(stderr, "\tD:\tDatabase Table\n") ; \
fprintf(stderr, "\tf:\tFilestem\n") ; \
fprintf(stderr, "\tR:\tResponse attribute to predict\n") ; \
fprintf(stderr, "\te:\tevaluation function [%d]\n", DEF_EVAL_FN) ; \
fprintf(stderr, "\tm:\tmin heuristic measure [%g]\n", MIN_MEASURE) ; \
fprintf(stderr, "\tc:\tmin cover [%d]\n", MIN_COVER) ; \
fprintf(stderr, "\tn:\tnumerical attr div ratio [%g]\n", NUM_PART_RATIO) ; \
fprintf(stderr, "\n") ; \
fprintf(stderr, "\tExample\n") ; \
fprintf(stderr, "\t%s -f iris -Rclass -v\n",argv[0]) ; \
fprintf(stderr, "\n") ;
/*********************************************************************
**********************************************************************
** FORWARD DECLARATION
**********************************************************************
*********************************************************************/
extern void bzero(void *, int) ;
extern void bcopy (const void *, void *, int) ;
extern int getopt(int, char * const [], const char *) ;
extern char *strtok(char *, const char *) ;
static void test_rule(
FILE *, struct domain_str *, struct classifier_str *, char *, int ) ;
static void print_rule(
FILE *, struct domain_str, struct classifier_str * ) ;
static int eval_fn(
struct classifier_str *, struct classifier_str *, struct domain_str, int ) ;
/* static int ratio_compare (struct test *, struct test * ) ; */
static int ratio_compare (const void *, const void *) ;
static void set_rule_description (rule_type *, domain_type) ;
/*****************************************************************
******************************************************************
** MAIN **
******************************************************************
*****************************************************************/
int main(int argc, char *argv[])
{
FILE *datadict_file=NULL ;
FILE *test_file=NULL ;
FILE *output_file=NULL;
char Filestem[100] ;
char Datadict_filename[256] ;
char Output_filename[256] ;
char Tests_filename[256] ;
char DB_Table_Name[256] ;
char Response_Attribute[TOKEN] = {0} ;
int min_cover = MIN_COVER ;
float min_measure = MIN_MEASURE ;
float num_part_ratio = NUM_PART_RATIO ;
int eval_function = DEF_EVAL_FN ;
int i ; /* counter */
int interactive=1 ;
char datadict_string[STRING] ;
char test_string[STRING] ; /* holds event vector values */
char test_accuracy=1 ; /* default: best class accuracy */
void *DB_ptr;
int test_counter=0 ;
domain_type Domain_Info ;
rule_type Max_Rule ;
/***********************************************
** INITIALIZE
***********************************************/
DB_Table_Name[0] = (char)NULL ;
Filestem[0] = (char)NULL ;
Output_filename[0] = (char)NULL ;
Response_Attribute[0] = (char)NULL ;
/***********************************************
** DECOMPOSE THE PROGRAM ARGUMENTS
***********************************************/
{
char c ; /* parameter flag character */
extern char *optarg ;
while ((c = getopt(argc, argv, "dhisvt:f:D:R:n:c:m:e:")) != -1)
switch(c)
{
case 'd': /* Debug */
debug=1 ;
verbose=1 ;
break ;
case 'h': /* Simple help message */
USAGE ;
exit(-1) ;
case 'i': /* Interactive instead of batch */
interactive=1 ;
break ;
case 't': /* Test accuracy instead of report rules */
test_accuracy=atoi(optarg) ;
break ;
case 'v': /* Verbose */
verbose=1 ;
break ;
case 'f': /* file prefix */
strcpy(Filestem, optarg) ;
break ;
case 'D': /* Database table */
strcpy(DB_Table_Name, optarg) ;
break ;
case 'R': /* Response attribute to predict */
strcpy(Response_Attribute, optarg) ;
break ;
case 'c': /* Minimum cover parameter */
sscanf(optarg, "%d", &min_cover) ;
break ;
case 'm': /* Minimum measure */
sscanf(optarg, "%f", &min_measure) ;
break ;
case 'n': /* Response attribute to predict */
sscanf(optarg, "%f", &num_part_ratio) ;
break ;
case 'e': /* Evaluation function */
if (!strcmp(optarg,"cos"))
eval_function = 1;
else if(!strcmp(optarg,"dist"))
eval_function = 2;
else if(!strcmp(optarg,"entropy"))
eval_function = 3;
else if(!strcmp(optarg,"cos2"))
eval_function = 4;
else if(!strcmp(optarg,"dist2"))
eval_function = 5;
else if(!strcmp(optarg,"entropy2"))
eval_function = 6;
else {
fprintf(stderr,"ERROR: Invalid parameter -e %s\n", optarg);
exit(-1);
}
break ;
}
}
/* Report the version information in verbose mode */
if (verbose) fprintf(stdout,
"v%4.2f(%s)\n - Min Measure %g\tMin Cover:%d\tNum Prun: %g\tEval Function:%d\n\n",
VERSION, DATE, min_measure, min_cover, num_part_ratio, eval_function );
/*****************************************************/
/* TEST FOR APPROPRIATE VALUES & REQUIRED PARAMETERS */
/*****************************************************/
if (Response_Attribute == (char)NULL) {
fprintf(stderr, "ERROR: Response attribute parameter (-R) not set\n");
exit(-1);
}
if (Filestem[0] == (char)NULL) {
fprintf(stderr, "ERROR: Filestem parameter (-f) not set\n");
exit(-1);
}
/*
if (test_accuracy < 1) {
fprintf(stderr, "ERROR: test accuracy of [%d] is invalid\n",
test_accuracy );
exit(-1);
}
*/
/***********************************************
** SETUP AND TEST I/O FILES
***********************************************/
#if DBMS==1
if (debug) fprintf(stderr, "FILE-based dataset\n") ;
if (DB_Table_Name[0] == (char)NULL) {
sprintf(DB_Table_Name, "%s.data", Filestem) ;
}
#elif DBMS==2
if (debug) fprintf(stderr, "SYBASE DB-LIB based dataset\n") ;
if (DB_Table_Name[0] == (char)NULL) {
sprintf(DB_Table_Name, "%s", Filestem) ;
}
#endif
sprintf(Datadict_filename, "%s.dict", Filestem) ;
if ((datadict_file = fopen(Datadict_filename, "r")) == NULL) {
fprintf(stderr,
"ERROR: Could not open the data dictionary file: [%s]\n",
Datadict_filename);
exit(-1);
}
sprintf(Tests_filename, "%s.test", Filestem) ;
if ((test_file = fopen(Tests_filename, "r")) == NULL) {
fprintf(stderr,
"ERROR: Could not open the tests file: %s\n",
Tests_filename);
exit(-1);
}
if (interactive) { /* interactive sessions pipe the results to stdout */
strcpy(Output_filename, "<stdout>") ;
output_file = stdout ;
}
else {
sprintf(Output_filename, "%s.output", Filestem) ;
if ((output_file = fopen(Output_filename, "w")) == NULL) {
fprintf(stderr, "ERROR: Could not open output file: %s\n",
Output_filename);
exit(-1);
}
}
/***********************************************
** DEBUG
***********************************************/
if (debug) {
fprintf(stderr, "debug: \n") ;
fprintf(stderr, "debug: This is a selection query\n") ;
if (interactive) fprintf(stderr, "debug: Interactive session\n") ;
else fprintf(stderr, "debug: Batch session\n") ;
if (test_accuracy) fprintf(stderr, "debug: Test accuracy query\n") ;
else fprintf(stderr, "debug: Report rule query\n") ;
fprintf(stderr, "debug: DB_Table_Name = [%s]\n", DB_Table_Name) ;
fprintf(stderr, "debug: Response Attrribute = [%s]\n",
Response_Attribute) ;
fprintf(stderr, "debug: Data Dict. file = [%s]\n",
Datadict_filename) ;
fprintf(stderr, "debug: Tests file = [%s]\n",
Tests_filename) ;
fprintf(stderr, "debug: Output file = [%s]\n",
Output_filename) ;
fprintf(stderr, "debug: \n") ;
}
/***********************************************
** READ IN THE DATA DICTIONARY INFORMATION
***********************************************/
Domain_Info.attributes=0 ;
fgets(datadict_string, STRING, datadict_file) ;
while (!feof(datadict_file)) {
char *token ;
char attr_desc[MAX_ATTRIBUTES][TOKEN] ;
float min ;
float max ;
int tokens, i ;
/* prune the ".\n" ending from the buffer if it exists */
if ((token=strstr(datadict_string,".\n")) != NULL)
*token = (char)NULL ;
if ((token=strstr(datadict_string,"\n")) != NULL)
*token = (char)NULL ;
/* if (debug)fprintf(stderr, "debug: datadict_string %s\n", datadict_string); */
/* Place this attributes tokens into an array */
tokens=0 ;
token = strtok(datadict_string, ":") ;
strcpy(attr_desc[tokens], token) ;
for (token=strtok(NULL, ","), i=1 ;
token != NULL ;
token=strtok(NULL, ","), i++ ) {
/* get rid of any initial spaces */
while(*token==' ' || *token=='\t') token++ ;
strcpy(attr_desc[i], token) ;
if(debug)fprintf(stderr, " [%s]\n", attr_desc[i]);
}
tokens=i ;
if (tokens<3) {
fprintf(stderr, "ERROR in %s at line %d: Information missing.\n",
Datadict_filename, Domain_Info.attributes+1) ;
exit(-1);
}
strcpy(Domain_Info.Datadict[Domain_Info.attributes].name,
attr_desc[0]) ;
if (!strcmp(NOMINAL_STRING, attr_desc[1] ) )
Domain_Info.Datadict[Domain_Info.attributes].type = NOMINAL ;
else if (!strcmp(ORDINAL_STRING, attr_desc[1] ))
Domain_Info.Datadict[Domain_Info.attributes].type = ORDINAL ;
else if (!strcmp(CONTINUOUS_STRING, attr_desc[1] ) )
Domain_Info.Datadict[Domain_Info.attributes].type = CONTINUOUS ;
else {
fprintf(stderr, "ERROR in %s: invalid domain type %s.\n",
Datadict_filename, attr_desc[1]) ;
exit(-1);
}
/* Determine this attributes domain membership */
if (Domain_Info.Datadict[Domain_Info.attributes].type != NOMINAL) {
if (sscanf(attr_desc[2], "%f", &min) != 1) {
fprintf(stderr, "ERROR in %s: invalid min value [%s].\n",
Datadict_filename, attr_desc[2] ) ;
exit(-1) ;
}
else
Domain_Info.Datadict[Domain_Info.attributes].min = min ;
if (sscanf(attr_desc[3], "%f", &max) != 1) {
fprintf(stderr, "ERROR in %s: invalid max value [%s].\n",
Datadict_filename, attr_desc[2]) ;
exit(-1) ;
}
else
Domain_Info.Datadict[Domain_Info.attributes].max=max ;
/* if (debug) fprintf(stderr, "debug: min[%f], max[%f]\n",
Domain_Info.Datadict[Domain_Info.attributes].min,
Domain_Info.Datadict[Domain_Info.attributes].max); */
}
else {
for (i=2 ; i<tokens ; i++) {
strcpy(Domain_Info.Datadict[Domain_Info.attributes].members[i-2],
attr_desc[i]) ;
if (debug) fprintf(stderr, "[%s] ",
Domain_Info.Datadict[Domain_Info.attributes].members[i-2]) ;
}
Domain_Info.Datadict[Domain_Info.attributes].size = i-2 ;
}
if (debug) fprintf(stderr, "\ndebug: Datadict entry[%d] type[%d] size[%d] name[%s]\n",
Domain_Info.attributes,
Domain_Info.Datadict[Domain_Info.attributes].type,
Domain_Info.Datadict[Domain_Info.attributes].size,
Domain_Info.Datadict[Domain_Info.attributes].name
);
/* Proceed to the next data dictionary item */
fgets(datadict_string, STRING, datadict_file) ;
Domain_Info.attributes++ ;
if (Domain_Info.attributes > MAX_ATTRIBUTES) {
fprintf(stderr,
"ERROR: number of attributes is greater than MAX [%d]\n",
MAX_ATTRIBUTES);
exit(1) ;
}
}
/*******************************************************/
/* LOCATE THE RESPONSE VARIABLE IN THE DATA DICTIONARY */
/*******************************************************/
Domain_Info.resp_attr = -1 ; /* default is out of bounds */
for (i=0; i<Domain_Info.attributes; i++) {
/* if (debug) fprintf(stderr, "[%s]^[%s]\n", Response_Attribute,
Domain_Info.Datadict[i].name) ; */
if(!strcmp(Response_Attribute,
Domain_Info.Datadict[i].name))
Domain_Info.resp_attr=i ;
}
if (Domain_Info.resp_attr == -1) {
fprintf(stderr, "ERROR: The response attribute [%s] ",
Response_Attribute);
fprintf(stderr, "was not found in the data dictionary\n");
exit(-1);
}
else if (Domain_Info.Datadict[Domain_Info.resp_attr].type != NOMINAL) {
fprintf(stderr, "ERROR: The response attribute [%s] ",
Response_Attribute);
fprintf(stderr, "must be of type [%s]\n", NOMINAL_STRING);
exit(-1);
}
else if(debug) {
fprintf(stderr,
"debug: response attr found. Datadict[%d] = [%s]\n",
Domain_Info.resp_attr,
Domain_Info.Datadict[Domain_Info.resp_attr].name) ;
}
/***********************************************
** INITIALIZE THE DATABASE COMMUNICATION CHANNEL
***********************************************/
if (!init_db(&DB_ptr, DB_Table_Name, Domain_Info)) {
fprintf(stderr, "ERROR: Unable to initialize the database\n");
exit(-1);
}
if (debug) fprintf(stderr, "debug: Domain=[%d]\n",
Domain_Info.Datadict[Domain_Info.resp_attr].size) ;
if (Domain_Info.Datadict[Domain_Info.resp_attr].size > MAX_DOMAIN_SIZE) {
fprintf(stderr,
"ERROR: The number of distinct values in the response attribute %s",
Response_Attribute) ;
fprintf(stderr,
"\t[%d] is greater than the current maximum allowed [%d].\n",
Domain_Info.Datadict[Domain_Info.resp_attr].size,
MAX_ATTRIBUTES) ;
exit(-1) ;
}
/***********************************************
** BEGIN PROCESSING EVENT VECTORS
***********************************************/
time_1=time(NULL) ;
fgets(test_string, STRING, test_file);
while (!feof(test_file)) {
attr_val event_vector[MAX_ATTRIBUTES];
char *token, *str_ptr ;
if ((str_ptr=strstr(test_string,".\n"))!=NULL) *str_ptr = (char)NULL ;
if ((str_ptr=strrchr(test_string,'\n'))!=NULL) *str_ptr = (char)NULL ;
if (verbose) {
fprintf(output_file,
"\n***********************************************************\n");
fprintf(output_file, "Make a prediction for test event #%d\n\n",
++test_counter);
}
if(debug) fprintf(stderr, "debug: process next test event [%s]\ndebug: ",
test_string) ;
for (token = strtok(test_string,","), i=0;
token != NULL ;
token = strtok(NULL, ","), i++ )
{
if (Domain_Info.Datadict[i].type == NOMINAL) {
int member, j;
/* get rid of any initial spaces */
while(*token==' ') token++ ;
if (!strcmp(token, "?"))
member=NOMINAL_NA ;
else {
/* BUG cannot handle any padded whitespaces */
for (j=0, member=NOMINAL_NA-1;
j<Domain_Info.Datadict[i].size;
j++)
if (!strcmp(token, Domain_Info.Datadict[i].members[j]))
member=j ;
if (member==NOMINAL_NA-1) {
fprintf(stderr, "ERROR in test file: [%s] ", token) ;
fprintf(stderr, "is not a member of attr [%s]\n",
Domain_Info.Datadict[i].name );
exit(-1);
}
}
event_vector[i].nominal = member ;
if (debug) fprintf(stderr, "e_v[%d][%s]->%d ",
i, token, member) ;
}
else {
float value ;
/* get rid of any initial spaces */
while(*token==' ') token++ ;
if (!strcmp(token, "?"))
value=CONTINUOUS_NA ;
else {
if (sscanf(token, "%f", &value) != 1) {
fprintf(stderr, "ERROR in test entry [%s]\n", test_string) ;
fprintf(stderr, "\t[%s] for attr [%s] is not numerical.\n",
token, Domain_Info.Datadict[i].name );
exit(-1);
}
if (value>Domain_Info.Datadict[i].max) {
fprintf(stderr, "ERROR in test entry %s:\n",test_string) ;
fprintf(stderr,
"\t%f for attr %sis larger than the domain max %f.\n",
value, Domain_Info.Datadict[i].name,
Domain_Info.Datadict[i].max);
exit(-1);
}
else if (value<Domain_Info.Datadict[i].min) {
fprintf(stderr, "ERROR in test entry %s:\n",test_string) ;
fprintf(stderr,
"\t%f for attr %sis smaller than the domain min %f.\n",
value, Domain_Info.Datadict[i].name,
Domain_Info.Datadict[i].min);
exit(-1);
}
}
if (debug) fprintf(stderr, "e_v[%d][%s]->%f ",
i, token, value) ;
event_vector[i].continuous = value ;
}
} /* end of foreach attribute-value */
/* complete the debug line */
if (debug) fprintf(stderr, "\n") ;
/*******************************************************
** DISCOVER THE PREDICTED RULE FOR THIS EVENT VECTOR **
*******************************************************/
{
int i ;
rule_type Null_Rule ; /* The prediction if no info given */
rule_type Max_Test_Rule ;
rule_type Test_Rule ;
/*****************************************
** DO A TOP DOWN SEARCH UNTIL LOCAL MAXIMA
*****************************************/
if(debug) fprintf(stderr,
"debug: PREDICT (Top Down Search Until Local Maxima)\n") ;
/* Get and keep the overall distribution for future usage*/
bzero(&Null_Rule, sizeof(rule_type)) ;
clear_rule(Null_Rule.db_priv) ;
create_rule(DB_ptr, &Domain_Info, &Null_Rule);
/* clear_rule(Null_Rule.db_priv) ;*/
Domain_Info.N = Null_Rule.cover ;
/* Start the Max_Rule out as the Null_Rule */
bcopy(&Null_Rule, &Max_Rule, sizeof(struct classifier_str) ) ;
/* while specializations are possible */
bzero(&Max_Test_Rule, sizeof(rule_type)) ;
Max_Test_Rule.value = 1;
while (Max_Test_Rule.value > min_measure ) {
if(debug)fprintf(stderr,"debug: perform a round of specialization\n");
Max_Test_Rule.value = 0 ;
/* try to specialize on every given attribute-value */
/* BUG: this should be a random selection so that ties */
/* do not necessarily go the the first attribute */
for (i = 0; i < Domain_Info.attributes; i++) {
/* is this a candidate term for conjunction? */
if ( i != Domain_Info.resp_attr
&& event_vector[i].nominal != NOMINAL_NA
&& event_vector[i].continuous != CONTINUOUS_NA ) {
bcopy(&Max_Rule, &Test_Rule, sizeof(rule_type)) ;
/* update the size of the antecedent if prev unused */
if (!Test_Rule.attribute_map[i]) Test_Rule.a_size++ ;
if (Domain_Info.Datadict[i].type == NOMINAL ) {
Test_Rule.Terms[i].nominal = event_vector[i].nominal ;
/* if (debug) fprintf(stderr, "assigning %d to term %d\n",
t_member, i); */
}
else {
float min, max, rmin, rmax ;
float delta ;
/* if this is the first specialization on this
non-nominal attribute then use the domain bounds */
if (!Test_Rule.attribute_map[i])
delta = Domain_Info.Datadict[i].max -
Domain_Info.Datadict[i].min ;
else
delta = Test_Rule.Terms[i].min_max[1] -
Test_Rule.Terms[i].min_max[0] ;
/* prune the range accordingly*/
delta /= num_part_ratio ;
delta /= 2.0 ; /* half on each side */
min = rmin = event_vector[i].continuous - delta ;
max = rmax = event_vector[i].continuous + delta ;
/* Round off ordinal types */
if (Domain_Info.Datadict[i].type == ORDINAL ) {
min = (float)floor(rmin+0.49999) ;
max = (float)floor(rmax+0.49999) ;
}
Test_Rule.Terms[i].min_max[MIN]=min ;
Test_Rule.Terms[i].min_max[MAX]=max ;
} /* end of term composition */
/* signal that this attribute has been used */
Test_Rule.attribute_map[i]++ ;
/* update the text Rule */
if (verbose) set_rule_description(&Test_Rule, Domain_Info) ;
/*****************************************************/
/* call SPECIALIZE_RULE() */
/* get the rule's cover and consequent from the data */
/* create_rule(DB_ptr, &Domain_Info, &Test_Rule); */
specialize_rule(DB_ptr, &Domain_Info, &Test_Rule, i);
/*****************************************************/
/* call EVALUATION_FUNCTION() */
/* get the quality of this rule */
if (Test_Rule.cover >= min_cover) {
eval_fn(&Test_Rule, &Max_Rule, Domain_Info,
eval_function) ;
}
else {
Test_Rule.value = 0 ;
}
/* is this test rule better than the current best test */
if (Max_Test_Rule.value < Test_Rule.value) {
if (debug) fprintf(stderr,
"NEW WINS *** %f < %f\tcover %d\n",
Max_Test_Rule.value, Test_Rule.value,
Test_Rule.cover);
clear_rule(Max_Test_Rule.db_priv) ;
bcopy(&Test_Rule, &Max_Test_Rule,
sizeof(rule_type) ) ;
}
else {
clear_rule(Test_Rule.db_priv) ;
if (debug) fprintf(stderr,
"old max_test stays %f < %f\tcover %d\n",
Max_Test_Rule.value, Test_Rule.value,
Test_Rule.cover);
}
}
else { /* may want to find out why this term was rejected */
if (debug) fprintf(stderr,
"debug: addition of term [%d] to rule [%s] FAILED.\n",
i, Max_Rule.Rule) ;
}
} /* go back and try any other potential term to specialize on */
if(debug)fprintf(stderr,"debug: end round of specialization\n");
/* Have we really specialized any further? */
if (Max_Test_Rule.value > min_measure) {
clear_rule(Max_Rule.db_priv);
/* update the max rule to be the winning test rule*/
bcopy(&Max_Test_Rule, &Max_Rule, sizeof(rule_type) ) ;
bzero(&Max_Test_Rule, sizeof(rule_type)) ;
Max_Test_Rule.value = 1;
if (debug) {
fprintf(stderr,
"NEW BEST RULE: COVER: %d\tValue %f\t\n",
Max_Rule.cover, Max_Rule.value) ;
print_rule(output_file, Domain_Info, &Max_Rule);
test_rule(output_file, &Domain_Info, &Max_Rule,
Domain_Info.Datadict[Domain_Info.resp_attr].members[(int)event_vector[Domain_Info.resp_attr].nominal],
test_accuracy);
}
}
} /* while specializations are possible) */
if(debug)fprintf(stderr,"debug: Max Rule found - cover %d !> %d\n",
Max_Rule.cover, Max_Test_Rule.cover) ;
} /* end of predictions section */
/***********************************************************/
/** PROCESS THE PREDICTED (OR SELECTED) RULE AS REQUESTED **/
/***********************************************************/
if(verbose)
print_rule(output_file, Domain_Info, &Max_Rule);
/* Report whether this prediction was correct to the appropriate degree
given that the correct value has been provided in the event vector */
if(test_accuracy && event_vector[Domain_Info.resp_attr].nominal!=NOMINAL_NA ) {
int resp_attr=Domain_Info.resp_attr ;
int member ;
member = event_vector[resp_attr].nominal ;
if (debug) fprintf(stderr, "%d\t%d\t[%s]\n", resp_attr, member,
Domain_Info.Datadict[resp_attr].members[member]) ;
test_rule(output_file, &Domain_Info, &Max_Rule,
Domain_Info.Datadict[resp_attr].members[member],
test_accuracy);
}
/* release this rule */
clear_rule(Max_Rule.db_priv) ;
/* prepare for the next test event */
fgets(test_string, STRING, test_file);
} /* while there are more test events*/
/***********************************************/
/** FINISH **/
/***********************************************/
if (debug) fprintf(stderr, "debug: about to FINISH\n") ;
db_exit(DB_ptr) ;
if (output_file != stdout) fclose(output_file) ;
exit(0) ;
} /* EXIT main() */
/*****************************************************************
******************************************************************
** SUPPORT FUNCTIONS **
******************************************************************
*****************************************************************/
/*****************************************************************************
** Class Separation Distance Measure
** ORT measure proposed in Fayyad,Irani[AAAI-92]
** This is the cosine of the angle between the two distributions
*****************************************************************************/
int eval_fn(
rule_type *TR , /* Test Rule */
rule_type *MR , /* Max Rule */
domain_type DI , /* Domain Info */
int measure
)
{
int i ;
int classes = DI.Datadict[DI.resp_attr].size;
double TR_cover=TR->cover ;
double MR_cover=MR->cover ;
double TRC_cover ;
double TRC_ratio=0 ;
double TR_ratio ; /* store into variable for efficiency */
double MR_ratio ; /* store into variable for efficiency */
double sum_abab = 0 ; /* Sum along dimensions of squares of (TR*TRC)'s */
double sum_ab2 = 0 ; /* Sum along dimensions of squares of (TR-TRC)'s */
double sum_a2 = 0 ; /* Sum along dimensions of squares of TR's */
double sum_b2 = 0 ; /* Sum along dimensions of squares of TRC's */
/* for entropy calc */
/* double log10_2 = log10(2) ; hardcode to calc base2 logs */
double entropy_p = 0 ;
double entropy_l = 0 ;
double entropy_r = 0 ;
double entr_norm = 0 ; /* entropy normalization quotient */
/* does this rule (or its complement) have a cover? */
if ( (TR_cover==0) || (MR_cover-TR_cover==0) ) {
TR->value = (float)0 ;
if (debug) fprintf(stderr, "debug: Everything matched!\n") ;
return(0) ;
}
/* get the normalization quotient for this entropy measure version */
if (measure==6) {
/* foreach class ratio */
for (i=0; i<classes; i++) {
/* if the ratio is zero then add zero (nothing) */
if (TR->conseq_ratios[i] > 0.00001)
entr_norm += TR->conseq_ratios[i]/MR->conseq_ratios[i] ;
}
}
/* foreach class value accumulate the differences */
for (i=0; i<classes; i++) {
TR_ratio = TR->conseq_ratios[i] ; /* make local for efficiency */
MR_ratio = MR->conseq_ratios[i] ; /* make local for efficiency */
/* Set the TRC_ratio based on the parents and sister's distributions*/
TRC_cover = MR_ratio*MR_cover - TR_ratio*TR_cover ;
TRC_ratio = TRC_cover / (MR_cover - TR_cover) ;
if (debug) fprintf(stderr, "%f %f %f %f %f %f\n",
MR->conseq_ratios[i], MR->conseq_ratios[i]*MR->cover,
TR_ratio, TR_ratio*TR_cover,
TRC_ratio, TRC_cover ) ;
/* for Cos() */
if (measure==1) {
sum_abab += ( TR_ratio * TRC_ratio ) ;
sum_a2 += ( TR_ratio * TR_ratio ) ;
sum_b2 += ( TRC_ratio * TRC_ratio ) ;
}
/* for Cos2() */
if (measure==4) {
sum_abab += ( TR_ratio * MR_ratio ) ;
sum_a2 += ( TR_ratio * TR_ratio ) ;
sum_b2 += ( MR_ratio * MR_ratio ) ;
}
/* for Distance() */
if (measure==2) {
sum_ab2 += (TR_ratio - TRC_ratio)*(TR_ratio - TRC_ratio);
/* fprintf(stderr,
"%g <-+ (%g - %g)^2\n", sum_ab2, TR_ratio, TRC_ratio);
*/
}
/* for Distance2() */
if (measure==5) {
sum_ab2 += (TR_ratio - MR_ratio)*(TR_ratio - MR_ratio);
}
/* for Entropy() */
if (measure==3) { /* test for small so that 0*log(0) = 0 */
if (MR_ratio > 0.00001)
entropy_p -= MR->conseq_ratios[i] * log10(MR->conseq_ratios[i])
/ LOG10_2 ;
if (TR_ratio > 0.00001)
entropy_r -= TR_ratio * log10(TR_ratio)/LOG10_2 ;
if (TRC_ratio > 0.00001)
entropy_l -= TRC_ratio * log10(TRC_ratio)/LOG10_2 ;
/* fprintf(stderr, "_p %g , _r %g, _l %g\n",
entropy_p, entropy_r, entropy_l); */
}
/* for Entropy() */
if (measure==6) { /* test for small so that 0*log(0) = 0 */
double norm_ratio ;
if (MR_ratio >= 0) { /* i.e. always */
/* normalized parent's ratio = 1/classes by definition */
norm_ratio = 1.0/classes ;
entropy_p -= norm_ratio * log10(norm_ratio)/LOG10_2 ;
}
if (TR_ratio > 0.00001) {
norm_ratio = (TR_ratio/MR_ratio) / entr_norm ;
entropy_r -= norm_ratio * log10(norm_ratio)/LOG10_2 ;
}
}
/* printf("") ; */
}
/** Cos() based i.e. ORT */
if ((measure==1) || (measure==4)) {
/** calculate the cosine of the angle between the two distr. vectors */
double cos_angle = sum_abab / (sqrt(sum_a2)*sqrt(sum_b2));
TR->value = 1 - cos_angle ;
if (debug) fprintf(stderr, "%g <- 1 - (%g / (sqrt(%g) * sqrt(%g)))\n",
TR->value, sum_abab, sum_a2, sum_b2 ) ;
}
/** Euclidean distance() based */
if ((measure == 2) || (measure == 5)) {
/** calc distance between the two vectors */
/** normalize [0,sqrt(2)] to [0,1] */
TR->value = sqrt(sum_ab2)/SQRT_2 ;
if (debug) fprintf(stderr, "%f <- sqrt(%g)/%g\n",
TR->value, sum_ab2, sqrt(2) ) ;
}
/** Entropy() based */
if (measure == 3) {
/* probability of each path */
double p_l = (MR_cover - TR_cover)/MR_cover ;
double p_r = (TR_cover)/MR_cover ;
/** parents entropy minus the portion of the children's entropy */
TR->value = entropy_p - p_l*entropy_l - p_r*entropy_r ;
if (debug) fprintf(stderr, "%f <- %f - %g*%g - %g*%g\n",
TR->value, entropy_p, p_l, entropy_l, p_r, entropy_r ) ;
}
/** Entropy2() based */
if (measure == 6) {
/** parents entropy minus the the child's entropy */
TR->value = entropy_p - entropy_r ;
/* if (debug) fprintf(stderr, "%g <- %g - %g\n",
TR->value, entropy_p, entropy_r ) ; */
}
if(debug) fprintf(stderr, "heuristic->%f\n", TR->value) ;
return (0) ;
}
/*****************************************************************************
** print a report of the rule consequent
*****************************************************************************/
void print_rule(
FILE *fd ,
domain_type DI , /* Domain_Info */
rule_type *Rule
)
{
struct test TEST[MAX_ATTRIBUTES]; /* Ratios repository to be sorted */
int classes = DI.Datadict[DI.resp_attr].size ;
char Quantity[13] ;
char Ratio[13] ;
int i ;
if (debug) fprintf(stderr,"debug: enter print_rule()\n") ;
/* prepare the test table */
bzero(&TEST, sizeof(TEST)) ;
for(i=0; i<classes; i++) {
TEST[i].index = i ;
TEST[i].ratio = Rule->conseq_ratios[i] ;
}
/* qsort into descending order */
qsort (&TEST, classes, sizeof(struct test), ratio_compare) ;
/* print the banner */
if (Rule->Rule != NULL)
fprintf(fd, "BECAUSE\n%s\n", Rule->Rule );
else
fprintf(fd, "%s\n", NULL_RULE_DESC) ;
fprintf(fd, "PREDICT\n");
fprintf(fd, "%-25s\t%-13s\t%-13s\n",
DI.Datadict[DI.resp_attr].name,
"Quantity", "Ratio" );
fprintf(fd, "%-25s\t%-13s\t%-13s\n", "-------------------------", "-------------", "-------------" );
/* print out the consequent */
for (i=0; i<classes; i++) {
/* get the quantity and ratio for this class */
sprintf(Quantity, "%d", /* round this number */
(int)floor(0.5 + (Rule->cover * TEST[i].ratio) ) ) ;
sprintf(Ratio, "%f", TEST[i].ratio ) ;
fprintf(fd, "%-25s\t%-13s\t%-13s\n",
DI.Datadict[DI.resp_attr].members[TEST[i].index],
Quantity, Ratio ) ;
}
}
/*****************************************************************************
** print the correctness of the rule consequent
** BUG: What to return when equally probable predictions arise (e.g. 50:50:0)?
*****************************************************************************/
void test_rule(
FILE *fd ,
domain_type *DI ,
rule_type *Rule ,
char *True_Response ,
int within /* the placement of the correct class in */
/* the consequent must be within this value. */
)
{
struct test TEST[MAX_ATTRIBUTES]; /* Ratios repository to be sorted */
int found_at=MAX_ATTRIBUTES+1 ;
int i ;
char result[TOKEN] ;
int classes = DI->Datadict[DI->resp_attr].size ;
if (debug) fprintf(stderr,"debug: enter test_rule()\n") ;
/* prepare the test table */
bzero(&TEST, sizeof(TEST)) ;
for(i=0; i<classes; i++) {
TEST[i].index = i ;
TEST[i].ratio = Rule->conseq_ratios[i] ;
}
/* qsort into descending order */
qsort (&TEST, classes, sizeof(struct test), ratio_compare) ;
/* process the result */
for(i=0; i<classes; i++) {
if (debug) fprintf(stderr,"debug: [%s]^[%s]",
DI->Datadict[DI->resp_attr].members[TEST[i].index],
True_Response) ; /* BUG */
if ( !strcmp(DI->Datadict[DI->resp_attr].members[TEST[i].index],
True_Response) ) {
found_at=i ;
if (debug) fprintf(stderr, "\tcorrect match\n" ) ;
}
else
if (debug) fprintf(stderr, "\tdon't match\n" ) ;
}
if (found_at<within) strcpy(result, CORRECT_STRING) ;
else strcpy(result, WRONG_STRING) ;
if (verbose) {
fprintf(fd, "\n%8s %20s %8s %20s\n",
"Result", "Antecedent Size", "Cover", "True Response");
fprintf(fd, "%8s %20s %8s %20s\n", "--------",
"--------------------", "--------", "--------------------") ;
fprintf(fd, "%8s %20d %8d %20s\n", result, Rule->a_size, Rule->cover,
True_Response ) ;
}
else {
time_2=time(NULL) ;
fprintf(fd, "%s\t%d\t%d\t%d\n", result, Rule->a_size, Rule->cover, (int)(time_2-time_1)) ;
time_1=time_2 ;
}
fflush(fd) ;
}
/*****************************************************************************
** used as the sorting function in qsort()
*****************************************************************************/
static int ratio_compare(
const void *i_ ,
const void *j_
)
{
struct test *i=(struct test *)i_ ;
struct test *j=(struct test *)j_ ;
int value ;
value = 9999999 * (j->ratio - i->ratio);
if (debug) fprintf(stderr, "debug: %f\t%f -> %d\n",
j->ratio, i->ratio, value) ;
return (value);
}
static void set_rule_description (
rule_type *Rule,
domain_type DI
)
{
int i;
char term[TOKEN];
int first=0;
for(i=0; i<DI.attributes; i++) {
if (Rule->attribute_map[i]) {
char *attribute=DI.Datadict[i].name ;
if (first++) /* if first then clean out o/wise slap an AND */
strcat(Rule->Rule, " AND ") ;
else
strcpy(Rule->Rule, "") ;
if (DI.Datadict[i].type == NOMINAL) {
#if DBMS==1
/* SQL FORM */
sprintf(term, "%s=[%s]", attribute,
DI.Datadict[i].members[(int)Rule->Terms[i].nominal]) ;
#else
sprintf(term, "%s=\"%s\"", attribute,
DI.Datadict[i].members[(int)Rule->Terms[i].nominal]) ;
#endif
}
else {
#if DBMS==1
sprintf(term, "%s IN [%g, %g]" ,
attribute , Rule->Terms[i].min_max[MIN] ,
Rule->Terms[i].min_max[MAX] ) ;
#else
/* SQL FORM */
sprintf(term, "%s>=%g AND %s<=%g" ,
attribute , Rule->Terms[i].min_max[MIN] ,
attribute , Rule->Terms[i].min_max[MAX] ) ;
#endif
}
strcat(Rule->Rule, term) ;
}
}
}