Weight of Evidence (WoE) Transformation

Post date: May 19, 2014 8:29:08 PM

Here is the R code demonstrating how to use WoE. There are two parts: 1) transform the data into WoE space, and 2) use the transformed features to build a model.

Requires the script "func_woe_transformation.r" [download], which is loaded with source() in Part 1.

Part 1: Using Weight of Evidence

This code shows how to use the WoE functions to transform features into WoE space.

## Part 1: transform the Titanic features into Weight-of-Evidence (WoE) space.
##
## Steps: read the CSV, split it 60/40 into train/validation, learn a WoE
## table per selected variable on the training split, apply that table to
## both splits, and save the results as 'data_train.RData' and
## 'data_validate.RData' (objects wx.train and wx.test respectively).
##
## NOTE: the paths below are specific to the author's machine; adjust them.
setwd('/Users/kittipat/research/code_library/titanic/')

## ===== Import the training dataset =====
csv.input <- 'data_train.csv'

## Read a small sample first so the column classes can be reused for the
## full read below.
sampleData <- read.csv(csv.input, header = TRUE, nrows = 300)
classes <- sapply(sampleData, class)
df <- read.delim(file = csv.input,
                 sep = ",",
                 stringsAsFactors = FALSE,
                 header = TRUE,
                 na.strings = c("", "---", "9999"),
                 colClasses = classes)

names(df) <- tolower(names(df))

## -- make train/validation dataset --
## The split is random; call set.seed() beforehand if it must be reproducible.
num.train <- ceiling(0.6 * nrow(df))
id.train <- sample.int(n = nrow(df), size = num.train, replace = FALSE)
id.validate <- setdiff(x = seq_len(nrow(df)), y = id.train)
df.train <- df[id.train, ]
df.validate <- df[id.validate, ]

## ----------------------------------------------
## ======== transform the feature to WoE ========
## ----------------------------------------------
source('~/research/code_library/bot_function_repository/func_woe_transformation.r')

## ----- make variable report -----
y <- df.train$survived
capture.output(reportVariablesProfile(df = df.train, y = y),
               file = 'woe_table.txt')

## ----- user can manually select variables from suggested one here -----
## NOTE(review): 'variables_report_full.csv' is presumably written by
## reportVariablesProfile() above -- confirm against the sourced file.
var.report <- read.delim(file = 'variables_report_full.csv', header = TRUE,
                         sep = ',', stringsAsFactors = FALSE)
## which() drops rows whose 'recommended' flag is NA instead of yielding
## NA variable names.
var.name.recommended <- var.report$var.name[which(var.report$recommended == 1)]

## -----------------------------------
## ===== convert variable to WoE =====
## -----------------------------------

#var.name.list <- names(df.train)      ## ----- use all variables
var.name.list <- var.name.recommended  ## ----- use recommended variables only

## Accumulate one WoE vector per variable, keyed by variable name.
wx.train <- list()
wx.test <- list()

for (i in var.name.list) {
  x.train <- df.train[, i]
  x.test <- df.validate[, i]
  internal.type <- func_cast_to_main_class(x.train, 3)

  if (internal.type %in% c('character')) {
    ## --- create WoE transformation table (learned on training data only)
    woe.table <- func_binning_categ(x = x.train, y = y, alpha = 100)
    bin_name <- woe.table$bin.table$bin_name
    bin_woe <- woe.table$bin.table$woe
    out.woe.train <- func_convert_to_woe_categ(x.train, bin_name, bin_woe, 0)
    out.woe.test <- func_convert_to_woe_categ(x.test, bin_name, bin_woe, 0)
  } else if (internal.type %in% c('numeric')) {
    ## --- create WoE transformation table (learned on training data only)
    woe.table <- func_binning_numeric(x = x.train, y = y, alpha = 100,
                                      p.value.thr = 0.4)
    break_points <- woe.table$breakPoints
    bin_woe <- woe.table$bin.table$woe
    out.woe.train <- func_convert_to_woe_numeric(x.train, break_points, bin_woe, 0)
    out.woe.test <- func_convert_to_woe_numeric(x.test, break_points, bin_woe, 0)
  } else {
    ## Without this branch the previous variable's WoE values would be
    ## silently stored under this variable's name.
    warning("Skipping variable '", i, "': unsupported type '",
            internal.type, "'", call. = FALSE)
    next
  }

  wx.train[[i]] <- out.woe.train$woe
  wx.test[[i]] <- out.woe.test$woe
}

if (length(wx.train) == 0) {
  stop("No variable was transformed to WoE; check 'variables_report_full.csv'",
       call. = FALSE)
}

## ----- convert list to data frame -----
## check.names = FALSE keeps the variable names exactly as they are in the list.
wx.train <- data.frame(wx.train, check.names = FALSE, stringsAsFactors = FALSE)
wx.test <- data.frame(wx.test, check.names = FALSE, stringsAsFactors = FALSE)

## ----- organize and save the dataset -----
## Quick interactive inspection of the validation data.
names(df.validate)
head(wx.test, 20)

## Carry the identifier and the target over untransformed.
wx.train[, c('passengerid', 'survived')] <- df.train[, c('passengerid', 'survived')]
wx.test[, c('passengerid', 'survived')] <- df.validate[, c('passengerid', 'survived')]

save(wx.train, file = 'data_train.RData')
save(wx.test, file = 'data_validate.RData')

## ----- Now we can use the dataset to train/test model -----
Created by Pretty R at inside-R.org

Part 2: Building a classifier

Building a logistic regression model (with an elastic-net penalty) in the WoE space.

## This program builds a logistic regression model (glmnet, binomial family)
## on the WoE-transformed data saved by Part 1, then evaluates it on the
## validation set.

## library() stops immediately if glmnet is not installed; require() would
## only return FALSE and let the script fail later.
library(glmnet)

load('data_train.RData')     # provides wx.train
load('data_validate.RData')  # provides wx.test
wx.validate <- wx.test

## Columns that are not model features.
non.features <- c('survived', 'passengerid')

## --- prepare training dataset
## drop = FALSE keeps a data frame (and the column name) even if only one
## feature column remains.
x.train <- wx.train[, !(names(wx.train) %in% non.features), drop = FALSE]
x.train <- as.matrix(x.train)
y.train <- wx.train$survived

## --- prepare validation dataset
x.validate <- wx.validate[, !(names(wx.validate) %in% non.features), drop = FALSE]
x.validate <- as.matrix(x.validate)
y.validate <- wx.validate$survived

## --- build a model
fit2 <- glmnet(x.train, y.train,
               family = "binomial", alpha = 0.9, nlambda = 50)

## --- predict the outcome
## One column of predicted probabilities per lambda on the fitted path.
y.response <- predict(fit2, type = "response", newx = x.validate)
plot(fit2, pch = 19)

## --- evaluate the performance
## perf[j] is the number of validation rows classified correctly (0.5
## threshold) by the model at the j-th lambda.
y.out <- y.response > 0.5
perf <- colSums(y.out == as.logical(y.validate))
plot(perf)
Created by Pretty R at inside-R.org