IBM Watson Studio provides a useful platform for running machine learning, and the Lite version may prove particularly fruitful for small businesses wanting to become smart. The following resources and tools are available for free in the Watson Studio Cloud Lite Plan to collaboratively explore data with AI and machine learning:
Below I will run through a number of cloud resources available in IBM Watson Studio and set out a more thorough exploratory data analysis of the HMDA dataset using the R tidyverse. The R tidyverse code is provided below the video clips.
####################################################
# HMDA Boston tidyverse
# Exploratory Data Analysis
# Dataset described in
# http://pubs.aeaweb.org/doi/pdfplus/10.1257/jep.28.2.3
# Hal R. Varian
####################################################

library(Ecdat)
library(tidyverse)
library(party)

data(Hdma)

# fix annoying spelling error in the name of column 11
names(Hdma)[11] <- "condo"

# dir:    debt payments to total income ratio;
# hir:    housing expenses to income ratio;
# lvr:    ratio of size of loan to assessed value of property;
# ccs:    consumer credit score;
# mcs:    mortgage credit score;
# pbcr:   public bad credit record;
# dmi:    denied mortgage insurance;
# self:   self employed;
# single: applicant is single;
# uria:   1989 Massachusetts unemployment rate applicant's industry;
# condo:  condominium (column 11, renamed above);
# black:  race of applicant black;
# deny:   mortgage application denied;

# inspect the data
head(Hdma)
summary(Hdma)
str(Hdma)
view(Hdma)

# Proportions approved - no = approved
ggplot(Hdma, aes(x = deny)) +
  theme_bw() +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial, no implies approved"
  )

# Numbers with different ccs (the lower the better).
# geom_bar() counts rows, so the y axis is a count of applications.
ggplot(Hdma, aes(x = ccs)) +
  theme_bw() +
  geom_bar() +
  labs(y = "Number of applications", title = "consumer credit score")

# Numbers with different mcs (the lower the better)
ggplot(Hdma, aes(x = mcs)) +
  theme_bw() +
  geom_bar() +
  labs(y = "Number of applications", title = "mortgage credit score")

# Denials faceted by ccs (one panel per credit score)
ggplot(Hdma, aes(x = deny)) +
  theme_bw() +
  facet_wrap(~ ccs) +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial for varying ccs"
  )

################################
# all
################################

# exclude incomplete entries
# NOTE: the object name `all` is kept because the rest of the script
# refers to it.
all <- Hdma[complete.cases(Hdma), ]

# Denials faceted by ccs, complete cases only
ggplot(all, aes(x = deny)) +
  theme_bw() +
  facet_wrap(~ ccs) +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial for varying ccs"
  )

# Deny relative to ccs and pbcr
ggplot(all, aes(x = deny, fill = pbcr)) +
  theme_bw() +
  facet_wrap(~ ccs) +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial for varying ccs and pbcr"
  )

# Deny relative to ccs and dmi
ggplot(all, aes(x = deny, fill = dmi)) +
  theme_bw() +
  facet_wrap(~ ccs) +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial for varying ccs and dmi"
  )

# breakdown of employed and self employed
ggplot(all, aes(x = self)) +
  theme_bw() +
  geom_bar() +
  labs(y = "Number of applications", title = "Self Employed")

# examining mortgage approval in relation to employed and self employed
# status
ggplot(all, aes(x = deny, fill = self)) +
  theme_bw() +
  facet_wrap(~ ccs) +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial for varying ccs and self-employed"
  )

# Deny relative to mcs and dmi
ggplot(all, aes(x = deny, fill = dmi)) +
  theme_bw() +
  facet_wrap(~ mcs) +
  geom_bar() +
  labs(
    y = "Mortgage Deny Count",
    title = "Mortgage denial for varying mcs and dmi"
  )

# setting out a histogram for lvr
# (the axis label now states the binwidth that is actually used)
ggplot(all, aes(x = lvr)) +
  theme_bw() +
  geom_histogram(binwidth = 0.1) +
  labs(
    y = "number of mortgage application in lvr band",
    x = "lvr (binwidth = 0.1)",
    title = "lvr Distribution"
  )

# exploring lvr and likely effects on mortgage approval
ggplot(all, aes(x = lvr, fill = deny)) +
  theme_bw() +
  geom_histogram(binwidth = 0.1) +
  labs(
    y = "number of mortgage application in lvr band",
    x = "lvr (binwidth = 0.1)",
    title = "lvr Distribution"
  )

###########################################################
# logistic regressions of deny: on lvr alone, then on every column
logit.fitlvr <- glm(deny ~ lvr, data = all, family = "binomial")
summary(logit.fitlvr)

logit.fit <- glm(deny ~ ., data = all, family = "binomial")
summary(logit.fit)
###########################################################

# exploring the relationship between dir and hir
ggplot(all, aes(x = dir, y = hir)) +
  geom_point()

# dir against hir, coloured by ccs and then by deny
ggplot(data = all) +
  geom_point(mapping = aes(x = dir, y = hir, color = ccs))

ggplot(data = all) +
  geom_point(mapping = aes(x = dir, y = hir, color = deny))

# same two plots restricted to dir < 1 and hir < 1
all %>%
  filter(dir < 1, hir < 1) %>%
  ggplot() +
  geom_point(mapping = aes(x = dir, y = hir, color = ccs))

all %>%
  filter(dir < 1, hir < 1) %>%
  ggplot() +
  geom_point(mapping = aes(x = dir, y = hir, color = deny))

# dir against hir, faceted by ccs and coloured by ccs
ggplot(data = all) +
  geom_point(mapping = aes(x = dir, y = hir, color = ccs)) +
  facet_wrap(~ ccs, nrow = 2)

# dir against hir, faceted by ccs and coloured by deny
ggplot(data = all) +
  geom_point(mapping = aes(x = dir, y = hir, color = deny)) +
  facet_wrap(~ ccs, nrow = 2)

# as above, with a smoother added to each panel
ggplot(data = all) +
  geom_point(mapping = aes(x = dir, y = hir, color = deny)) +
  facet_wrap(~ ccs, nrow = 2) +
  geom_smooth(mapping = aes(x = dir, y = hir))

# exploring the relationship between dir and hir for dir < 1
all %>%
  filter(dir < 1) %>%
  ggplot() +
  geom_point(mapping = aes(x = dir, y = hir, color = deny)) +
  facet_wrap(~ ccs, nrow = 2) #+
# geom_smooth(mapping = aes(x = dir, y = hir))

cor(all$dir, all$hir)

# exploring the relationship between lvr and dir for dir < 1
all %>%
  filter(dir < 1) %>%
  ggplot() +
  geom_point(mapping = aes(x = lvr, y = dir, color = deny)) +
  facet_wrap(~ ccs, nrow = 2) +
  geom_smooth(mapping = aes(x = lvr, y = dir))

# same plot restricted further to lvr < 1
all %>%
  filter(dir < 1, lvr < 1) %>%
  ggplot() +
  geom_point(mapping = aes(x = lvr, y = dir, color = deny)) +
  facet_wrap(~ ccs, nrow = 2) +
  geom_smooth(mapping = aes(x = lvr, y = dir))

cor(all$dir, all$lvr)

# boxplots of dir by mortgage decision
ggplot(data = all) +
  geom_boxplot(mapping = aes(x = deny, y = dir))

ggplot(data = all) +
  geom_boxplot(mapping = aes(x = deny, y = dir)) +
  facet_wrap(~ ccs, nrow = 2)

all %>%
  filter(dir < 2) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = deny, y = dir)) +
  facet_wrap(~ ccs, nrow = 2)

# boxplots of lvr by mortgage decision (lvr < 2)
all %>%
  filter(lvr < 2) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = deny, y = lvr)) +
  facet_wrap(~ ccs, nrow = 2)

all %>%
  filter(lvr < 2) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = deny, y = lvr, color = self)) +
  facet_wrap(~ ccs, nrow = 2)

# lvr boxplot for African American applicants relative to the rest of
# the population
all %>%
  filter(lvr < 2) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = deny, y = lvr, color = black)) +
  facet_wrap(~ ccs, nrow = 2)

# The following pivot tables provide another tool aggregating and
# summarising relationships in data
pivot1 <- all %>%
  group_by(deny) %>%
  summarize(Medianlvr = median(lvr, na.rm = TRUE), count = n()) %>%
  arrange(deny)
View(pivot1)

# With more than one grouping column summarize() leaves the result
# grouped, so ungroup() before sorting and storing it.
pivot2 <- all %>%
  group_by(deny, dmi) %>%
  summarize(Medianlvr = median(lvr, na.rm = TRUE), count = n()) %>%
  ungroup() %>%
  arrange(deny, dmi)
View(pivot2)

# conditional inference tree of deny on every other column
all.fit <- ctree(deny ~ ., data = all)

# Figure 5 in paper
# pdf() and graphics.off() belong together: re-enable both to write the
# tree to all.pdf. graphics.off() on its own would close the on-screen
# plot (and every other open device) straight after drawing it.
# pdf("all.pdf", height = 8, width = 16)
plot(all.fit)
# graphics.off()

pivot3 <- all %>%
  group_by(deny, ccs, mcs) %>%
  summarize(meandir = mean(dir, na.rm = TRUE), count = n()) %>%
  ungroup() %>%
  arrange(deny, ccs, mcs)
View(pivot3)