IBM Watson Studio provides a useful platform for running Machine Learning and developing Business Intelligence. Cloud resources can be accessed virtually for free, and the Lite version of the service may prove a fruitful exemplar of the tools now available to small businesses wanting to harness the cloud remotely. Rather than owning their own computing infrastructure or data centres, micro-entrepreneurs can rent access from a cloud service provider. This option may also save money but of course does not have the allure of being free. One advantage of using cloud computing services is that firms can avoid the upfront cost and responsibility of owning and maintaining their own IT infrastructure, and simply pay for what they use, as and when they use it.
Resources like this are increasingly being relied upon by small businesses to leverage economies of scale that can be summoned to your computer, tablet or phone. The following resources and tools are available for free to explore AI and machine learning functionality within Watson Studio:
50 capacity unit hours/month
Integrated environments
Publish and collaborate in the cloud
Notebook servers and RStudio for interactivity and data visualization with Python, R, and Scala
Small businesses are typically resource-stretched actors and so would benefit from being able to make use of data tools normally exclusive to the big players. Of course, free sometimes means your data is the commodity being sold on to third parties. Machine Learning and Artificial Intelligence are increasingly relied upon by business owners. Below, we explore a limited number of cloud resources available within IBM Watson Studio and further develop the exploratory data analysis of the HMDA dataset, replete with graphing, data queries and pivot tables. R Tidyverse code is provided underneath the video clips, so implementation is kept tractable and straightforward. The video clips do not provide an exhaustive, detailed account of IBM Watson Studio resources. They are merely intended to showcase a limited but nontrivial aspect of Cloud functionality that interested parties might explore further.
####################################################
# HMDA Boston tidyverse
# Exploratory Data Analysis
# Dataset Described in
# http://pubs.aeaweb.org/doi/pdfplus/10.1257/jep.28.2.3
# Hal R. Varian
###################################################
library(Ecdat)
library(tidyverse)
library(party)
# Load the Hdma data set into the workspace.
data("Hdma")

# Column 11 arrives with a misspelled name; replace it with a short, clean one.
colnames(Hdma)[11] <- "condo"

# Variable glossary ----
#   dir     debt payments to total income ratio
#   hir     housing expenses to income ratio
#   lvr     ratio of size of loan to assessed value of property
#   ccs     consumer credit score
#   mcs     mortgage credit score
#   pbcr    public bad credit record
#   dmi     denied mortgage insurance
#   self    self employed
#   single  applicant is single
#   uria    1989 Massachusetts unemployment rate in the applicant's industry
#   condo   condominium (column 11, renamed just above)
#   black   race of applicant black
#   deny    mortgage application denied

# First look at the data ----
head(Hdma)     # first rows
summary(Hdma)  # per-column summaries
str(Hdma)      # column types and structure
# NOTE(review): lowercase view() is presumably tibble's viewer wrapper made
# available through tidyverse; the rest of the script uses View() -- confirm.
view(Hdma)
# Bar charts on the raw Hdma data ----
# geom_bar() counts the rows in each x category, so the y axis of every chart
# below is a number of applications; the variable being examined sits on x.

# Approved vs denied applications (deny = "no" means approved)
ggplot(Hdma, aes(x = deny)) +
  theme_bw() +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial, no implies approved"
  )

# Number of applications at each ccs value (the lower the better).
# The y axis used to be labelled "score", but the score is on x and y is a
# count, so both axes are now labelled explicitly.
ggplot(Hdma, aes(x = ccs)) +
  theme_bw() +
  geom_bar() +
  labs(
    x = "consumer credit score (ccs)",
    y = "number of applications",
    title = "consumer credit score"
  )

# Number of applications at each mcs value (the lower the better).
# Same axis-label correction as for the ccs chart.
ggplot(Hdma, aes(x = mcs)) +
  theme_bw() +
  geom_bar() +
  labs(
    x = "mortgage credit score (mcs)",
    y = "number of applications",
    title = "mortgage credit score"
  )

# Small multiples: the approve/deny bar chart repeated once per ccs value
ggplot(Hdma, aes(x = deny)) +
  theme_bw() +
  facet_wrap(~ ccs) +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial for varying ccs"
  )
################################
# all: complete cases only
################################
# Keep only the rows of Hdma that contain no missing values.
# Note: the data frame shares its name with base::all(); calls such as
# all(...) still find the function, but the name is easy to misread.
all <- Hdma[complete.cases(Hdma), , drop = FALSE]

# Approve/deny counts, one panel per ccs value
ggplot(data = all, mapping = aes(x = deny)) +
  geom_bar() +
  facet_wrap(~ ccs) +
  ylab("Mortgage Deny Count") +
  ggtitle("Mortgage denial for varying ccs") +
  theme_bw()

# Same panels, bars split by public bad credit record (pbcr)
ggplot(data = all, mapping = aes(x = deny, fill = pbcr)) +
  geom_bar() +
  facet_wrap(~ ccs) +
  ylab("Mortgage Deny Count") +
  ggtitle("Mortgage denial for varying ccs and pbcr") +
  theme_bw()

# Same panels, bars split by denied mortgage insurance (dmi)
ggplot(data = all, mapping = aes(x = deny, fill = dmi)) +
  geom_bar() +
  facet_wrap(~ ccs) +
  ylab("Mortgage Deny Count") +
  ggtitle("Mortgage denial for varying ccs and dmi") +
  theme_bw()
# Number of applications by self-employment status.
# geom_bar() puts a count on the y axis, so labelling y "Self Employed" was
# misleading; that label now sits on x and y is described as a count.
ggplot(all, aes(x = self)) +
  theme_bw() +
  geom_bar() +
  labs(
    x = "Self Employed",
    y = "Number of applications",
    title = "Self Employed"
  )

# Approve/deny counts per ccs panel, bars split by self-employment status
ggplot(all, aes(x = deny, fill = self)) +
  theme_bw() +
  facet_wrap(~ ccs) +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial for varying ccs and self-employed"
  )

# Approve/deny counts per mcs panel, bars split by denied mortgage insurance
ggplot(all, aes(x = deny, fill = dmi)) +
  theme_bw() +
  facet_wrap(~ mcs) +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial for varying mcs and dmi"
  )
# Histograms of the loan-to-value ratio (lvr) ----
# The bin width is defined once and the x-axis label is built from it, so the
# label can no longer disagree with the bins actually drawn (the first plot
# previously drew 0.1-wide bins while its label claimed 0.05).
lvr_binwidth <- 0.1
lvr_xlab <- paste0("lvr (binwidth = ", lvr_binwidth, ")")

# Overall distribution of lvr
ggplot(all, aes(x = lvr)) +
  theme_bw() +
  geom_histogram(binwidth = lvr_binwidth) +
  labs(
    y = "number of mortgage application in lvr band",
    x = lvr_xlab,
    title = "lvr Distribution"
  )

# Same histogram with each bar split by the deny outcome, to see how
# approvals and denials are spread across lvr bands
ggplot(all, aes(x = lvr, fill = deny)) +
  theme_bw() +
  geom_histogram(binwidth = lvr_binwidth) +
  labs(
    y = "number of mortgage application in lvr band",
    x = lvr_xlab,
    title = "lvr Distribution"
  )
############################################################
# Logistic regressions (binomial glm) with deny as the response

# deny explained by the loan-to-value ratio alone
logit.fitlvr <- glm(
  formula = deny ~ lvr,
  family = "binomial",
  data = all
)
summary(logit.fitlvr)

# deny explained by every other column of `all`
logit.fit <- glm(
  formula = deny ~ .,
  family = "binomial",
  data = all
)
summary(logit.fit)
###########################################################
# Scatter plots of dir (debt-to-income) against hir (housing-expense-to-income)

# Plain scatter, all complete cases
ggplot(data = all) +
  geom_point(mapping = aes(x = dir, y = hir))

# Points coloured by consumer credit score
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(aes(color = ccs))

# Points coloured by the deny outcome
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(aes(color = deny))

# Same two plots restricted to rows with both ratios below 1
ggplot(filter(all, dir < 1, hir < 1), aes(x = dir, y = hir)) +
  geom_point(aes(color = ccs))

ggplot(filter(all, dir < 1, hir < 1), aes(x = dir, y = hir)) +
  geom_point(aes(color = deny))

# One panel per ccs value (two rows of panels), coloured by ccs
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(aes(color = ccs)) +
  facet_wrap(~ ccs, nrow = 2)

# One panel per ccs value, coloured by the deny outcome
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(aes(color = deny)) +
  facet_wrap(~ ccs, nrow = 2)

# As above with a smoothed trend added; the smoother only receives x and y,
# so it is fitted to all points of a panel rather than per deny colour
ggplot(all, aes(x = dir, y = hir)) +
  geom_point(aes(color = deny)) +
  facet_wrap(~ ccs, nrow = 2) +
  geom_smooth()
# dir against hir per ccs panel, restricted to rows with dir < 1
# (a geom_smooth() layer over dir/hir was tried here and is left disabled)
ggplot(filter(all, dir < 1), aes(x = dir, y = hir)) +
  geom_point(aes(color = deny)) +
  facet_wrap(~ ccs, nrow = 2)

# Correlation of dir and hir, computed on all complete cases (no dir filter)
with(all, cor(dir, hir))

# lvr against dir per ccs panel with a smoothed trend, rows with dir < 1
ggplot(filter(all, dir < 1), aes(x = lvr, y = dir)) +
  geom_point(aes(color = deny)) +
  facet_wrap(~ ccs, nrow = 2) +
  geom_smooth()

# Same plot, additionally restricted to rows with lvr < 1
ggplot(filter(all, dir < 1, lvr < 1), aes(x = lvr, y = dir)) +
  geom_point(aes(color = deny)) +
  facet_wrap(~ ccs, nrow = 2) +
  geom_smooth()

# Correlation of dir and lvr, again on all complete cases
with(all, cor(dir, lvr))
# Box plots comparing approved and denied applications ----

# dir by deny outcome, all complete cases
ggplot(all, aes(x = deny, y = dir)) +
  geom_boxplot()

# dir by deny outcome, one panel per ccs value
ggplot(all, aes(x = deny, y = dir)) +
  geom_boxplot() +
  facet_wrap(~ ccs, nrow = 2)

# Same panels, restricted to rows with dir < 2
ggplot(filter(all, dir < 2), aes(x = deny, y = dir)) +
  geom_boxplot() +
  facet_wrap(~ ccs, nrow = 2)

# lvr by deny outcome per ccs panel, restricted to rows with lvr < 2
ggplot(filter(all, lvr < 2), aes(x = deny, y = lvr)) +
  geom_boxplot() +
  facet_wrap(~ ccs, nrow = 2)

# As above, with separate boxes by self-employment status
ggplot(filter(all, lvr < 2), aes(x = deny, y = lvr)) +
  geom_boxplot(aes(color = self)) +
  facet_wrap(~ ccs, nrow = 2)

# As above, with separate boxes by the black indicator (race of applicant),
# to compare lvr for Black applicants relative to the rest of the sample
ggplot(filter(all, lvr < 2), aes(x = deny, y = lvr)) +
  geom_boxplot(aes(color = black)) +
  facet_wrap(~ ccs, nrow = 2)
# Pivot-table style summaries: another way to aggregate and summarise
# relationships in the data ----

# Median lvr and number of applications for each deny outcome.
# With a single grouping variable summarize() already returns an ungrouped
# table, so no ungroup() is needed here.
pivot1 <- all %>%
  group_by(deny) %>%
  summarize(
    Medianlvr = median(lvr, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(deny)
View(pivot1)

# Median lvr and number of applications for each deny x dmi combination.
# summarize() removes only the last grouping level, so the result would stay
# grouped by deny; ungroup() hands a plain table to later code.
pivot2 <- all %>%
  group_by(deny, dmi) %>%
  summarize(
    Medianlvr = median(lvr, na.rm = TRUE),
    count = n()
  ) %>%
  ungroup() %>%
  arrange(deny, dmi)
View(pivot2)

# Conditional inference tree for deny using every other column of `all`
all.fit <- ctree(deny ~ ., data = all)
# Figure 5 in paper
# To write the figure to a file, uncomment pdf() and dev.off() TOGETHER.
# The device-closing call used to run unconditionally even though pdf() was
# commented out, which closed every open graphics device (including the one
# showing this tree) as soon as the plot was drawn.
# pdf("all.pdf", height = 8, width = 16)
plot(all.fit)
# dev.off()

# Mean dir and number of applications for each deny x ccs x mcs combination;
# ungroup() for the same reason as pivot2.
pivot3 <- all %>%
  group_by(deny, ccs, mcs) %>%
  summarize(
    meandir = mean(dir, na.rm = TRUE),
    count = n()
  ) %>%
  ungroup() %>%
  arrange(deny, ccs, mcs)
View(pivot3)