IBM Watson Studio provides a useful platform for running machine learning, and the Lite version may prove particularly fruitful for small businesses wanting to become smart. The following resources and tools are available for free to collaboratively explore data with AI and machine learning in the Watson Studio Cloud - Lite Plan:
Below I will run through a number of cloud resources available in IBM Watson Studio and set out a more thorough exploratory data analysis of the HMDA dataset (named `Hdma` in the Ecdat package) using the R Tidyverse. The R Tidyverse code is provided below the video clips.
####################################################
# HMDA Boston tidyverse
# Exploratory Data Analysis
# Dataset Described in
# http://pubs.aeaweb.org/doi/pdfplus/10.1257/jep.28.2.3
# Hal R. Varian
###################################################
library(Ecdat)
library(tidyverse)
library(party)
# Load the Hdma data set into the workspace.
data("Hdma")
# The 11th column name contains a spelling error; replace it with the short
# label "condo".
colnames(Hdma)[11] <- "condo"
# dir: debt payments to total income ratio;
# hir: housing expenses to income ratio;
# lvr: ratio of size of loan to assessed value of property;
# ccs: consumer credit score;
# mcs: mortgage credit score;
# pbcr: public bad credit record;
# dmi: denied mortgage insurance;
# self: self employed;
# single: applicant is single;
# uria: 1989 Massachusetts unemployment rate applicant's industry;
# condo (column 11, renamed above from its misspelled original name): condominium;
# black: race of applicant black;
# deny: mortgage application denied;
# First look at the data: leading rows, per-column summary, structure,
# and finally the whole table in the data viewer.
Hdma %>% head()
Hdma %>% summary()
Hdma %>% str()
view(Hdma)
# Bar chart of application outcomes; a deny value of "no" means the
# mortgage was approved.
Hdma %>%
  ggplot(mapping = aes(x = deny)) +
  geom_bar() +
  labs(
    title = "Mortgage denial, no implies approved",
    y = "Mortgage Deny Count"
  ) +
  theme_bw()
# Number of applications at each consumer credit score (the lower the better).
# geom_bar() counts the rows for every ccs value, so the score sits on the
# x-axis and the y-axis is a count of applications, not a score.
ggplot(Hdma, aes(x = ccs)) +
  theme_bw() +
  geom_bar() +
  labs(x = "score",
       y = "number of mortgage applications",
       title = "consumer credit score")
# Number of applications at each mortgage credit score (the lower the better).
# Same layout as above: score on the x-axis, count on the y-axis.
ggplot(Hdma, aes(x = mcs)) +
  theme_bw() +
  geom_bar() +
  labs(x = "score",
       y = "number of mortgage applications",
       title = "mortgage credit score")
# Small multiples on the raw data: one denial bar chart per consumer
# credit score (ccs).
Hdma %>%
  ggplot(mapping = aes(x = deny)) +
  geom_bar() +
  facet_wrap(facets = ~ ccs) +
  labs(
    title = "Mortgage denial for varying ccs",
    y = "Mortgage Deny Count"
  ) +
  theme_bw()
################################
# all
################################
# Keep only the rows of Hdma that have no missing value in any column.
# The result is stored as `all` and is the data used by the plots, models
# and summaries that follow.
all <- Hdma[complete.cases(Hdma), , drop = FALSE]
# Small multiples on the complete cases: one denial bar chart per consumer
# credit score (ccs).
all %>%
  ggplot(mapping = aes(x = deny)) +
  geom_bar() +
  facet_wrap(facets = ~ ccs) +
  labs(
    title = "Mortgage denial for varying ccs",
    y = "Mortgage Deny Count"
  ) +
  theme_bw()
# Denial counts per ccs facet, bars filled by public bad credit record (pbcr).
all %>%
  ggplot(mapping = aes(x = deny, fill = pbcr)) +
  geom_bar() +
  facet_wrap(facets = ~ ccs) +
  labs(
    title = "Mortgage denial for varying ccs and pbcr",
    y = "Mortgage Deny Count"
  ) +
  theme_bw()
# Denial counts per ccs facet, bars filled by denied mortgage insurance (dmi).
all %>%
  ggplot(mapping = aes(x = deny, fill = dmi)) +
  geom_bar() +
  facet_wrap(facets = ~ ccs) +
  labs(
    title = "Mortgage denial for varying ccs and dmi",
    y = "Mortgage Deny Count"
  ) +
  theme_bw()
# Breakdown of employed versus self-employed applicants.
# geom_bar() draws one bar per value of `self` whose height is the number of
# rows, so "Self Employed" labels the x-axis and the y-axis is a count.
ggplot(all, aes(x = self)) +
  theme_bw() +
  geom_bar() +
  labs(x = "Self Employed",
       y = "number of mortgage applications",
       title = "Self Employed")
# Denial counts per ccs facet, bars filled by self-employment status, to see
# how approval relates to being employed versus self-employed.
all %>%
  ggplot(mapping = aes(x = deny, fill = self)) +
  geom_bar() +
  facet_wrap(facets = ~ ccs) +
  labs(
    title = "Mortgage denial for varying ccs and self-employed",
    y = "Mortgage Deny Count"
  ) +
  theme_bw()
# Denial counts facetted by mortgage credit score (mcs), bars filled by
# denied mortgage insurance (dmi).
all %>%
  ggplot(mapping = aes(x = deny, fill = dmi)) +
  geom_bar() +
  facet_wrap(facets = ~ mcs) +
  labs(
    title = "Mortgage denial for varying mcs and dmi",
    y = "Mortgage Deny Count"
  ) +
  theme_bw()
# Histogram of the loan-to-value ratio (lvr).
# The x-axis label states the bin width, so it has to agree with the binwidth
# actually passed to geom_histogram() (0.1).
ggplot(all, aes(x = lvr)) +
  theme_bw() +
  geom_histogram(binwidth = 0.1) +
  labs(y = "number of mortgage application in lvr band",
       x = "lvr (binwidth = 0.1)",
       title = "lvr Distribution")
# Same histogram with the bars filled by outcome, to explore how lvr relates
# to mortgage approval.
ggplot(all, aes(x = lvr, fill = deny)) +
  theme_bw() +
  geom_histogram(binwidth = 0.1) +
  labs(y = "number of mortgage application in lvr band",
       x = "lvr (binwidth = 0.1)",
       title = "lvr Distribution")
############################################################
# Binomial GLM of denial on the loan-to-value ratio alone.
logit.fitlvr <- glm(formula = deny ~ lvr, family = "binomial", data = all)
summary(logit.fitlvr)
# Binomial GLM of denial on every other column of `all`.
logit.fit <- glm(formula = deny ~ ., family = "binomial", data = all)
summary(logit.fit)
###########################################################
# Scatter plots of the debt-to-income ratio (dir) against the
# housing-expense-to-income ratio (hir).
# Plain scatter.
all %>%
  ggplot() +
  geom_point(mapping = aes(x = dir, y = hir))
# Points coloured by consumer credit score.
ggplot(all, aes(x = dir, y = hir, color = ccs)) +
  geom_point()
# Points coloured by outcome.
ggplot(all, aes(x = dir, y = hir, color = deny)) +
  geom_point()
# Same two plots restricted to applications with both ratios below 1.
ggplot(filter(all, dir < 1, hir < 1), aes(x = dir, y = hir, color = ccs)) +
  geom_point()
ggplot(filter(all, dir < 1, hir < 1), aes(x = dir, y = hir, color = deny)) +
  geom_point()
# dir versus hir, one panel per consumer credit score (two rows of panels).
# Coloured by ccs.
ggplot(all, aes(x = dir, y = hir, color = ccs)) +
  geom_point() +
  facet_wrap(facets = ~ ccs, nrow = 2)
# Coloured by outcome.
ggplot(all, aes(x = dir, y = hir, color = deny)) +
  geom_point() +
  facet_wrap(facets = ~ ccs, nrow = 2)
# Coloured by outcome, with a single smoother per panel drawn over the points
# (colour is mapped on the point layer only, so the smoother is not split).
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(mapping = aes(color = deny)) +
  geom_smooth() +
  facet_wrap(facets = ~ ccs, nrow = 2)
# dir versus hir restricted to applications with dir < 1, coloured by outcome
# and facetted by consumer credit score.
ggplot(filter(all, dir < 1), aes(x = dir, y = hir)) +
  geom_point(mapping = aes(color = deny)) +
  facet_wrap(facets = ~ ccs, nrow = 2)
# Optional smoother layer, currently disabled:
#   + geom_smooth(mapping = aes(x = dir, y = hir))
# Correlation between dir and hir over all complete cases (no dir filter).
with(all, cor(dir, hir))
# lvr versus dir for applications with dir < 1: points coloured by outcome,
# one smoother per ccs panel drawn over the points.
ggplot(filter(all, dir < 1), aes(x = lvr, y = dir)) +
  geom_point(mapping = aes(color = deny)) +
  geom_smooth() +
  facet_wrap(facets = ~ ccs, nrow = 2)
# Same plot, additionally restricted to lvr < 1.
ggplot(filter(all, dir < 1, lvr < 1), aes(x = lvr, y = dir)) +
  geom_point(mapping = aes(color = deny)) +
  geom_smooth() +
  facet_wrap(facets = ~ ccs, nrow = 2)
# Correlation between dir and lvr over all complete cases (unfiltered).
with(all, cor(dir, lvr))
# Box plots of dir by outcome.
all %>%
  ggplot(mapping = aes(x = deny, y = dir)) +
  geom_boxplot()
# Split into one panel per consumer credit score.
all %>%
  ggplot(mapping = aes(x = deny, y = dir)) +
  geom_boxplot() +
  facet_wrap(facets = ~ ccs, nrow = 2)
# Same panels, restricted to applications with dir < 2.
ggplot(filter(all, dir < 2), aes(x = deny, y = dir)) +
  geom_boxplot() +
  facet_wrap(facets = ~ ccs, nrow = 2)
# Box plots of lvr by outcome for applications with lvr < 2, one panel per
# consumer credit score.
ggplot(filter(all, lvr < 2), aes(x = deny, y = lvr)) +
  geom_boxplot() +
  facet_wrap(facets = ~ ccs, nrow = 2)
# Same plot with boxes coloured by self-employment status.
ggplot(filter(all, lvr < 2), aes(x = deny, y = lvr, color = self)) +
  geom_boxplot() +
  facet_wrap(facets = ~ ccs, nrow = 2)
# lvr box plot for African American applicants relative to the rest of the
# population (boxes coloured by `black`).
ggplot(filter(all, lvr < 2), aes(x = deny, y = lvr, color = black)) +
  geom_boxplot() +
  facet_wrap(facets = ~ ccs, nrow = 2)
# The following pivot tables provide another tool for aggregating and
# summarising relationships in the data.
# pivot1: median lvr and number of applications per outcome.
pivot1 <- all %>%
  group_by(deny) %>%
  summarize(Medianlvr = median(lvr, na.rm = TRUE),
            count = n()) %>%
  arrange(deny)
View(pivot1)
# pivot2: median lvr and number of applications per outcome and dmi value.
# summarize() only peels off the last grouping variable, so the result would
# still be grouped by deny; ungroup() returns a plain table so that later
# operations on pivot2 are not silently applied per group.
pivot2 <- all %>%
  group_by(deny, dmi) %>%
  summarize(Medianlvr = median(lvr, na.rm = TRUE),
            count = n()) %>%
  ungroup() %>%
  arrange(deny, dmi)
View(pivot2)
# Fit a tree model (party::ctree) of denial on all other columns of `all`.
all.fit <- ctree(deny ~ ., data = all)
# Figure 5 in paper
# The tree is drawn on the current (on-screen) graphics device. To write it
# to a PDF file instead, uncomment the pdf() line and the matching
# graphics.off() line together. Leaving graphics.off() active without pdf()
# closes every open device, discarding the plot right after it is drawn.
#pdf("all.pdf",height=8,width=16)
plot(all.fit)
#graphics.off()
# pivot3: mean dir and number of applications for every combination of
# outcome, consumer credit score and mortgage credit score.
# summarize() drops only the last grouping variable (mcs), so the result
# would stay grouped by deny and ccs; ungroup() returns a plain table.
pivot3 <- all %>%
  group_by(deny, ccs, mcs) %>%
  summarize(meandir = mean(dir, na.rm = TRUE),
            count = n()) %>%
  ungroup() %>%
  arrange(deny, ccs, mcs)
View(pivot3)