The Relationship Between Population and Car Prices in the U.S.

---

title: "Project 520"

author: "Anup Rijal"

date: "`r Sys.Date()`"

output: pdf_document

---

```{r}

library(readr)

library(tidyverse)

library(nnet)

library(xgboost)

library(Matrix)

library(ggplot2)

library(tidyverse)

library(randomForest)

car_dataset <- read.csv("/Users/doucuments/Desktop/STAT 520 project /USA_cars_datasets.csv")

state_dataset <- read.delim("/Users/doucuments/Desktop/STAT 520 project /state-population-estimations.tsv", header = TRUE)

```

```{r}

head(car_dataset)

head(state_dataset)

```

```{r}

library(tidyverse)

population_data <- state_dataset %>%

pivot_longer(

cols = starts_with("X"),

names_to = "Year",

values_to = "Population"

) %>%

mutate(

Year = as.integer(str_remove(Year, "X")),

Population = as.numeric(str_replace_all(Population, ",", ""))

)

car_dataset <- car_dataset %>%

mutate(state = str_to_title(state))

car_dataset_with_population <- car_dataset %>%

left_join(

population_data,

by = c("state" = "State", "year" = "Year")

) %>%

na.omit()

print(head(car_dataset_with_population))

cat("Original dataset rows:", nrow(car_dataset), "\n")

cat("Clean dataset rows:", nrow(car_dataset_with_population), "\n")

```

```{r}

num_columns <- ncol(car_dataset_with_population)

num_row <- nrow(car_dataset_with_population)

print(num_columns)

print(num_row)

```

```{r}

ggplot(car_dataset_with_population, aes(x = state, y = price)) +

geom_boxplot() +

labs(title = "Boxplot of Car Prices by State",

x = "State",

y = "Car Price") +

theme(axis.text.x = element_text(angle = 90, hjust = 1))

```

```{r}

ggplot(car_dataset_with_population, aes(x = state, fill = brand)) +

geom_bar(position = "stack") +

labs(title = "Car Brands Distribution by State",

x = "State",

y = "Number of Cars") +

theme(axis.text.x = element_text(angle = 90, hjust = 1))

```

```{r}

ggplot(car_dataset_with_population, aes(x = mileage, y = price)) +

geom_point() +

labs(title = "Car Price vs Mileage",

x = "Mileage",

y = "Car Price") +

theme_minimal()

```

```{r}

ggplot(car_dataset_with_population, aes(x = price)) +

geom_histogram(bins = 30, fill = "skyblue", color = "black") +

labs(title = "Distribution of Car Prices",

x = "Car Price",

y = "Frequency") +

theme_minimal()

```

```{r}

head(car_dataset_with_population)

```

```{r}

library(dplyr)

car_dataset_with_population <- car_dataset_with_population|>

select(-condition)

```

```{r}

car_dataset_with_population <- car_dataset_with_population|>

select(-vin)

```

```{r}

car_dataset_with_population <- car_dataset_with_population|>

select(-lot)

```

```{r}

car_dataset_with_population <- car_dataset_with_population|>

select(-country)

```

```{r}

head(car_dataset_with_population)

```

```{r}

ggplot(car_dataset_with_population, aes(x = Population, y = price)) +

geom_point() +

labs(title = "Car Price vs Population", x = "Population", y = "Price") +

theme_minimal()

```

```{r}

df <- na.omit(car_dataset_with_population)

df_model <- df %>%

select(price, brand, year, mileage, state, Population)

# Convert categorical variables to factors

df_model$brand <- as.factor(df_model$brand)

df_model$state <- as.factor(df_model$state)

# Split data

set.seed(4567)

train_index <- sample(nrow(df_model), 0.8 * nrow(df_model))

train_data <- df_model[train_index, ]

test_data <- df_model[-train_index, ]

#random forest model

rf_model <- randomForest(

price ~ .,

data = train_data,

ntree = 100,

mtry = 3,

importance = TRUE

)

print(rf_model)

importance_values <- importance(rf_model)

print(importance_values)

# Plot variable importance

varImpPlot(rf_model, main = "Variable Importance for Car Price Prediction")

# Evaluate model on test data

predictions <- predict(rf_model, test_data)

rmse <- sqrt(mean((predictions - test_data$price)^2))

r2 <- 1 - sum((test_data$price - predictions)^2) / sum((test_data$price - mean(test_data$price))^2)

mae <- mean(abs(predictions - test_data$price))

cat("Test RMSE:", rmse, "\n")

cat("Test R-squared:", r2, "\n")

cat("Test MAE:", mae, "\n")

```

```{r}

library(rpart)

library(rpart.plot)

tree_model <- rpart(price ~ ., data = train_data, cp = 0.05)

rpart.plot(tree_model, box.palette = "RdBu", nn = TRUE,

main = "Decision Tree for Car Price Prediction", digits = 5)

# Plot: actual and predicted values

plot(test_data$price, predictions,

main = "Actual vs Predicted Car Prices",

xlab = "Actual Price", ylab = "Predicted Price")

abline(0, 1, col = "red")

```

# Multiple linearregression

```{r}

df_lm_simple <- df %>%

select(price, year, mileage, Population)

linear_model <- lm(price ~ ., data = df_lm_simple)

summary(linear_model)

```

```{r}

predicted <- predict(linear_model, newdata = df_lm_simple)

actual <- df_lm_simple$price

rmse <- sqrt(mean((actual - predicted)^2))

mae <- mean(abs(actual - predicted))

r_squared <- summary(linear_model)$r.squared

cat("RMSE:", rmse, "\n")

cat("MAE:", mae, "\n")

cat("R-squared:", r_squared, "\n")

```

```{r}

df_cart <- df %>%

select(price, year, mileage, Population)

#CART MODEL

cart_model <- rpart(price ~ ., data = df_cart, method = "anova")

rpart.plot(cart_model, main = "CART: Predicting Car Price", extra = 101)

```

```{r}

# Predictions

pred_cart <- predict(cart_model, newdata = df_cart)

# Calculate RMSE and MAE

rmse_cart <- sqrt(mean((df_cart$price - pred_cart)^2))

mae_cart <- mean(abs(df_cart$price - pred_cart))

r_squared_cart <- 1 - sum((df_cart$price - pred_cart)^2) / sum((df_cart$price - mean(df_cart$price))^2)

cat("CART RMSE:", rmse_cart, "\n")

cat("CART MAE:", mae_cart, "\n")

cat("CART R-squared:", r_squared_cart, "\n")

```

```{r}

df_nn <- df %>%

select(price, year, mileage, Population)

scaled_data <- as.data.frame(scale(df_nn))

X <- scaled_data[, c("year", "mileage", "Population")]

Y <- scaled_data$price

# neural network with 1 hidden layer with 3 neurons

set.seed(123)

nn_model <- nnet(X, Y, size = 3, linout = TRUE)

# Predict

pred_nn <- predict(nn_model, X)

# performance

rmse_nn <- sqrt(mean((Y - pred_nn)^2))

mae_nn <- mean(abs(Y - pred_nn))

r_squared_nn <- 1 - sum((Y - pred_nn)^2) / sum((Y - mean(Y))^2)

cat("Neural Net RMSE:", rmse_nn, "\n")

cat("Neural Net MAE:", mae_nn, "\n")

cat("Neural Net R-squared:", r_squared_nn, "\n")

```

```{r}

mean_price <- mean(df_lm_simple$price)

sd_price <- sd(df_lm_simple$price)

real_rmse <- 0.8569626 * sd_price

real_mae <- 0.6371012 * sd_price

cat("Unscaled RMSE (in $):", round(real_rmse, 2), "\n")

cat("Unscaled MAE (in $):", round(real_mae, 2), "\n")

```

```{r}

df_xgb <- df %>%

select(price, year, mileage, Population)

X <- as.matrix(df_xgb[, c("year", "mileage", "Population")])

y <- df_xgb$price

# dmatrix format for xgboost

dtrain <- xgb.DMatrix(data = X, label = y)

```

```{r}

# set param

params <- list(

objective = "reg:squarederror", # regression

eval_metric = "rmse"

)

# Training model

xgb_model <- xgb.train(

params = params,

data = dtrain,

nrounds = 100,

verbose = 0

)

```

```{r}

pred_xgb <- predict(xgb_model, newdata = dtrain)

# Calculate metrics

rmse_xgb <- sqrt(mean((y - pred_xgb)^2))

mae_xgb <- mean(abs(y - pred_xgb))

r_squared_xgb <- 1 - sum((y - pred_xgb)^2) / sum((y - mean(y))^2)

cat("XGBoost RMSE:", round(rmse_xgb, 2), "\n")

cat("XGBoost MAE:", round(mae_xgb, 2), "\n")

cat("XGBoost R-squared:", round(r_squared_xgb, 4), "\n")

```

```{r}

# Create a data frame for plotting

xgb_plot_df <- data.frame(

actual = y,

predicted = pred_xgb

)

# Plot

ggplot(xgb_plot_df, aes(x = actual, y = predicted)) +

geom_point(alpha = 0.4, color = "blue") +

geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +

labs(

title = "XGBoost: Actual vs Predicted Car Prices",

x = "Actual Price ($)",

y = "Predicted Price ($)"

) +

theme_minimal()

```

```{r}

# Feature importance

importance_matrix <- xgb.importance(model = xgb_model)

# Plot

xgb.plot.importance(importance_matrix,

top_n = 10,

measure = "Gain",

rel_to_first = TRUE,

xlab = "Relative Importance")

```