setwd("~/AREC_ps10")
# This script
# load packages
library(pacman)
p_load(tidyverse,ggplot2,skimr,GGally,broom,ranger,rsample,caret)
install.packages("janitor")
library(janitor)
# read in dataset
raw <- read_csv("Colorado Farm Density.csv")
population=read_csv("PopulationEstimates.csv")%>%
mutate(fips=str_pad(`FIPStxt`,5,"left",0))%>%
filter(Attribute=="Population 2020")
sumstats <- skimr::skim(raw)
sumstats #print the summary stats
ggpairs(raw,columns = c("Value"))
raw %>%
select(Value) %>% #subset only our variables of interest
mutate(across(everything(),log)) %>% #log transform each of the variables.
ggpairs() #plot the tranformed variables
raw %>%
filter(Value>35) %>% #keep only stores with median distance from home <48km and median dwell less than 90 minutes
select(Value) %>% #subset only our variables of interest
mutate(across(everything(),log)) %>% #log transform each of the variables.
ggpairs() #plot the tranformed variables
###################################################
#######Cluster Analysis
# scale data
data_scaled <- raw %>%
mutate(str_state=str_pad(`State ANSI`,2,"left","0"),
str_county=str_pad(`County ANSI`,3,"left","0"),
fips=str_c(str_state,str_county))%>%
inner_join(population,by="fips")
select(Value.x,Value.y,fips) %>% #subsetting only the quantitative data
scale()
write_csv(data_scaled,"data_scaled.csv")
# perform k-means clustering with k=3
set.seed(123) # for reproducibility
kmeans_fit<- kmeans(data_scaled,
centers=3, #the number of clusters
nstart = 25) #the number of random starts
#create a dataframe with the store level attribute data not included in the clustering
location_info <- raw %>%
select(State, Value, `County ANSI`)
# add cluster labels to dataset and join to location info
data_clustered <- raw %>%
mutate(cluster = kmeans_fit$cluster) %>%
inner_join(location_info,by="County ANSI")
##########
##Regression
m1 <- lm(Value.x ~ Value.y, #specifying the regression formula
data = data_clustered)
new_data <- expand_grid(Value.x=seq(0,100000,1000),
Value.y=seq(0,5000,500))
fitted_data <- augment(m1,newdata = new_data) %>%
rename(number=.fitted)
write_csv(fitted_data,"fitted_data.csv")
In R we had to merge our two data sources together and make a fips code allign for all of the counties to the data would work.
We looked at a cluster if there was a correlation to farm density and population per county. We found that as population increases for the most part farm density in those counties also increased. There was a few that did not specifically follow that assumption but most counties did follow that assumption. We also see that there is not a perfect correlation to the generated trendline.
We made a cluster map of the values of of farm density and population in the same counties.