IBM Watson Studio and Exploratory Data Analysis

IBM Watson Studio

IBM Watson Studio provides a useful platform for running machine learning, and the Lite version may prove particularly fruitful for small businesses wanting to become smart. The following resources and tools are available for free in the Watson Studio Cloud Lite plan to explore data collaboratively with AI and machine learning:

50 capacity unit hours/month
Integrated environments
Publish and collaborate in the cloud
Notebook servers and R Studio for interactivity and data visualization with Python, R, and Scala

Below I will run through a number of cloud resources available in IBM Watson Studio and set out a more thorough exploratory data analysis of the HMDA dataset using the R tidyverse. The tidyverse code is provided below the video clips.

####################################################

# HMDA Boston tidyverse

# Exploratory Data Analysis

# Dataset Described in

# http://pubs.aeaweb.org/doi/pdfplus/10.1257/jep.28.2.3

# Hal R. Varian

###################################################

library(Ecdat)

library(tidyverse)

library(party)

# Load the Hdma data set (the Ecdat package is attached above).
data("Hdma")

# The original author flags the name of column 11 as a spelling error;
# replace it with the short name "condo".
colnames(Hdma)[11L] <- "condo"

# Data dictionary
#   dir    : debt payments to total income ratio
#   hir    : housing expenses to income ratio
#   lvr    : ratio of size of loan to assessed value of property
#   ccs    : consumer credit score
#   mcs    : mortgage credit score
#   pbcr   : public bad credit record
#   dmi    : denied mortgage insurance
#   self   : self employed
#   single : applicant is single
#   uria   : 1989 Massachusetts unemployment rate in the applicant's industry
#   condo  : condominium (column 11, renamed above)
#   black  : race of applicant black
#   deny   : mortgage application denied

# Inspect the data: first rows, per-column summary, structure, data viewer.
head(Hdma)
summary(Hdma)
str(Hdma)
view(Hdma)

# Bar chart of application outcomes; deny == "no" means the mortgage was
# approved.
ggplot(data = Hdma, mapping = aes(x = deny)) +
  geom_bar() +
  ylab("Mortgage Deny Count") +
  ggtitle("Mortgage denial, no implies approved") +
  theme_bw()

# Number of applications at each consumer credit score (the lower the better).
# geom_bar() counts rows per ccs value, so the score is on the x axis and the
# y axis is a count; the labels below say so explicitly.
ggplot(Hdma, aes(x = ccs)) +
  theme_bw() +
  geom_bar() +
  labs(x = "score",
       y = "Number of applications",
       title = "consumer credit score")

# Number of applications at each mortgage credit score (the lower the better).
# As above: score on x, count of applications on y.
ggplot(Hdma, aes(x = mcs)) +
  theme_bw() +
  geom_bar() +
  labs(x = "score",
       y = "Number of applications",
       title = "mortgage credit score")

# Side-by-side panels (facets): one denial bar chart per ccs value, drawn
# from the raw Hdma data.
ggplot(data = Hdma, mapping = aes(x = deny)) +
  geom_bar() +
  facet_wrap(~ ccs) +
  ylab("Mortgage Deny Count") +
  ggtitle("Mortgage denial for varying ccs") +
  theme_bw()

################################
# all
################################

# Keep only the rows of Hdma with no missing value in any column.
all <- Hdma[complete.cases(Hdma), , drop = FALSE]

# Side-by-side panels (facets): one denial bar chart per ccs value, drawn
# from the complete-case data.
ggplot(data = all, mapping = aes(x = deny)) +
  geom_bar() +
  facet_wrap(~ ccs) +
  ylab("Mortgage Deny Count") +
  ggtitle("Mortgage denial for varying ccs") +
  theme_bw()

# Denial counts per ccs panel, bars split by public bad credit record (pbcr).
ggplot(data = all, mapping = aes(x = deny, fill = pbcr)) +
  geom_bar() +
  facet_wrap(~ ccs) +
  ylab("Mortgage Deny Count") +
  ggtitle("Mortgage denial for varying ccs and pbcr") +
  theme_bw()

# Denial counts per ccs panel, bars split by denied mortgage insurance (dmi).
ggplot(data = all, mapping = aes(x = deny, fill = dmi)) +
  geom_bar() +
  facet_wrap(~ ccs) +
  ylab("Mortgage Deny Count") +
  ggtitle("Mortgage denial for varying ccs and dmi") +
  theme_bw()

# Breakdown of employed vs. self-employed applicants.
# geom_bar() puts the self-employed flag on x and the row count on y, so the
# axis labels are set accordingly (the count axis is not "Self Employed").
ggplot(all, aes(x = self)) +
  theme_bw() +
  geom_bar() +
  labs(x = "Self Employed",
       y = "Number of applications",
       title = "Self Employed")

# Mortgage approval by employment status: denial counts per ccs panel, bars
# split by the self-employed flag.
ggplot(data = all, mapping = aes(x = deny, fill = self)) +
  geom_bar() +
  facet_wrap(~ ccs) +
  ylab("Mortgage Deny Count") +
  ggtitle("Mortgage denial for varying ccs and self-employed") +
  theme_bw()

# Denial counts per mcs panel, bars split by denied mortgage insurance (dmi).
ggplot(data = all, mapping = aes(x = deny, fill = dmi)) +
  geom_bar() +
  facet_wrap(~ mcs) +
  ylab("Mortgage Deny Count") +
  ggtitle("Mortgage denial for varying mcs and dmi") +
  theme_bw()

# Histogram of the loan-to-value ratio (lvr).
# The x label states the bin width actually passed to geom_histogram() (0.1).
ggplot(all, aes(x = lvr)) +
  theme_bw() +
  geom_histogram(binwidth = 0.1) +
  labs(y = "number of mortgage applications in lvr band",
       x = "lvr (binwidth = 0.1)",
       title = "lvr Distribution")

# Same histogram with bars split by outcome, to explore how lvr relates to
# mortgage approval.
ggplot(all, aes(x = lvr, fill = deny)) +
  theme_bw() +
  geom_histogram(binwidth = 0.1) +
  labs(y = "number of mortgage applications in lvr band",
       x = "lvr (binwidth = 0.1)",
       title = "lvr Distribution")

############################################################

# Logistic regression of denial on the loan-to-value ratio alone.
logit.fitlvr <- glm(
  deny ~ lvr,
  data = all,
  family = "binomial"
)
summary(logit.fitlvr)

# Logistic regression of denial on every other column in `all`.
logit.fit <- glm(
  deny ~ .,
  data = all,
  family = "binomial"
)
summary(logit.fit)

###########################################################

# Relationship between dir and hir: plain scatter plot.
ggplot(data = all) +
  geom_point(mapping = aes(x = dir, y = hir))

# Same scatter, points coloured by consumer credit score.
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(aes(color = ccs))

# Same scatter, points coloured by outcome.
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(aes(color = deny))

# Restrict to rows with dir < 1 and hir < 1, coloured by ccs.
ggplot(filter(all, dir < 1, hir < 1), aes(x = dir, y = hir)) +
  geom_point(aes(color = ccs))

# Restrict to rows with dir < 1 and hir < 1, coloured by outcome.
ggplot(filter(all, dir < 1, hir < 1), aes(x = dir, y = hir)) +
  geom_point(aes(color = deny))

# dir vs. hir, one panel per ccs value (two rows), coloured by ccs.
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(aes(color = ccs)) +
  facet_wrap(~ ccs, nrow = 2)

# dir vs. hir, one panel per ccs value (two rows), coloured by outcome.
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(aes(color = deny)) +
  facet_wrap(~ ccs, nrow = 2)

# As above, with a smoothed trend line added in each panel. The smoother
# uses only x and y, so it is fitted per panel rather than per outcome.
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(aes(color = deny)) +
  facet_wrap(~ ccs, nrow = 2) +
  geom_smooth()

# dir vs. hir for rows with dir < 1, one panel per ccs value, coloured by
# outcome. A geom_smooth() layer on the same x/y was left disabled here.
ggplot(filter(all, dir < 1), aes(x = dir, y = hir)) +
  geom_point(aes(color = deny)) +
  facet_wrap(~ ccs, nrow = 2)

# Correlation between dir and hir over all complete cases.
with(all, cor(dir, hir))

# lvr vs. dir for rows with dir < 1, with a smoothed trend line per panel.
ggplot(filter(all, dir < 1), aes(x = lvr, y = dir)) +
  geom_point(aes(color = deny)) +
  facet_wrap(~ ccs, nrow = 2) +
  geom_smooth()

# Same plot, additionally restricted to rows with lvr < 1.
ggplot(filter(all, dir < 1, lvr < 1), aes(x = lvr, y = dir)) +
  geom_point(aes(color = deny)) +
  facet_wrap(~ ccs, nrow = 2) +
  geom_smooth()

# Correlation between dir and lvr over all complete cases.
with(all, cor(dir, lvr))

# Distribution of dir by outcome.
ggplot(all, aes(x = deny, y = dir)) +
  geom_boxplot()

# Distribution of dir by outcome, one panel per ccs value.
ggplot(all, aes(x = deny, y = dir)) +
  geom_boxplot() +
  facet_wrap(~ ccs, nrow = 2)

# Same, restricted to rows with dir < 2.
ggplot(filter(all, dir < 2), aes(x = deny, y = dir)) +
  geom_boxplot() +
  facet_wrap(~ ccs, nrow = 2)

# Distribution of lvr by outcome for rows with lvr < 2, per ccs value.
ggplot(filter(all, lvr < 2), aes(x = deny, y = lvr)) +
  geom_boxplot() +
  facet_wrap(~ ccs, nrow = 2)

# Same, with separate boxes for self-employed and other applicants.
ggplot(filter(all, lvr < 2), aes(x = deny, y = lvr, color = self)) +
  geom_boxplot() +
  facet_wrap(~ ccs, nrow = 2)

# lvr boxplot for African American applicants relative to the rest of the
# population.
ggplot(filter(all, lvr < 2), aes(x = deny, y = lvr, color = black)) +
  geom_boxplot() +
  facet_wrap(~ ccs, nrow = 2)

# The following pivot tables provide another tool for aggregating and
# summarising relationships in the data.

# Median lvr and number of applications for each outcome.
# .groups = "drop" returns a plain, ungrouped table.
pivot1 <- all %>%
  group_by(deny) %>%
  summarize(Medianlvr = median(lvr, na.rm = TRUE),
            count = n(),
            .groups = "drop") %>%
  arrange(deny)

View(pivot1)

# Median lvr and number of applications for each outcome / dmi combination.
# Without .groups = "drop" the result would stay grouped by deny and
# summarize() would emit a regrouping message.
pivot2 <- all %>%
  group_by(deny, dmi) %>%
  summarize(Medianlvr = median(lvr, na.rm = TRUE),
            count = n(),
            .groups = "drop") %>%
  arrange(deny, dmi)

View(pivot2)

# Tree model of denial on every other column in `all` (ctree() comes from the
# party package attached above).
all.fit <- ctree(deny ~ ., data = all)

# Figure 5 in paper
# The plot is drawn on the active graphics device. To write it to a file
# instead, uncomment the pdf() and dev.off() lines together; closing devices
# without the matching pdf() call would discard the plot just drawn.
# pdf("all.pdf", height = 8, width = 16)
plot(all.fit)
# dev.off()

# Mean dir and number of applications for each outcome / ccs / mcs
# combination. .groups = "drop" returns an ungrouped table and avoids the
# regrouping message summarize() emits for multi-key groupings.
pivot3 <- all %>%
  group_by(deny, ccs, mcs) %>%
  summarize(meandir = mean(dir, na.rm = TRUE),
            count = n(),
            .groups = "drop") %>%
  arrange(deny, ccs, mcs)

View(pivot3)

Page updated

Google Sites

Report abuse