---
title: "Project 520"
author: "Anup Rijal"
date: "`r Sys.Date()`"
output: pdf_document
---
```{r}
library(readr)
library(tidyverse)
library(nnet)
library(xgboost)
library(Matrix)
library(ggplot2)
library(tidyverse)
library(randomForest)
car_dataset <- read.csv("/Users/doucuments/Desktop/STAT 520 project /USA_cars_datasets.csv")
state_dataset <- read.delim("/Users/doucuments/Desktop/STAT 520 project /state-population-estimations.tsv", header = TRUE)
```
```{r}
head(car_dataset)
head(state_dataset)
```
```{r}
library(tidyverse)
population_data <- state_dataset %>%
pivot_longer(
cols = starts_with("X"),
names_to = "Year",
values_to = "Population"
) %>%
mutate(
Year = as.integer(str_remove(Year, "X")),
Population = as.numeric(str_replace_all(Population, ",", ""))
)
car_dataset <- car_dataset %>%
mutate(state = str_to_title(state))
car_dataset_with_population <- car_dataset %>%
left_join(
population_data,
by = c("state" = "State", "year" = "Year")
) %>%
na.omit()
print(head(car_dataset_with_population))
cat("Original dataset rows:", nrow(car_dataset), "\n")
cat("Clean dataset rows:", nrow(car_dataset_with_population), "\n")
```
```{r}
num_columns <- ncol(car_dataset_with_population)
num_row <- nrow(car_dataset_with_population)
print(num_columns)
print(num_row)
```
```{r}
ggplot(car_dataset_with_population, aes(x = state, y = price)) +
geom_boxplot() +
labs(title = "Boxplot of Car Prices by State",
x = "State",
y = "Car Price") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
```
```{r}
ggplot(car_dataset_with_population, aes(x = state, fill = brand)) +
geom_bar(position = "stack") +
labs(title = "Car Brands Distribution by State",
x = "State",
y = "Number of Cars") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
```
```{r}
ggplot(car_dataset_with_population, aes(x = mileage, y = price)) +
geom_point() +
labs(title = "Car Price vs Mileage",
x = "Mileage",
y = "Car Price") +
theme_minimal()
```
```{r}
ggplot(car_dataset_with_population, aes(x = price)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Distribution of Car Prices",
x = "Car Price",
y = "Frequency") +
theme_minimal()
```
```{r}
head(car_dataset_with_population)
```
```{r}
library(dplyr)
car_dataset_with_population <- car_dataset_with_population|>
select(-condition)
```
```{r}
car_dataset_with_population <- car_dataset_with_population|>
select(-vin)
```
```{r}
car_dataset_with_population <- car_dataset_with_population|>
select(-lot)
```
```{r}
car_dataset_with_population <- car_dataset_with_population|>
select(-country)
```
```{r}
head(car_dataset_with_population)
```
```{r}
ggplot(car_dataset_with_population, aes(x = Population, y = price)) +
geom_point() +
labs(title = "Car Price vs Population", x = "Population", y = "Price") +
theme_minimal()
```
```{r}
df <- na.omit(car_dataset_with_population)
df_model <- df %>%
select(price, brand, year, mileage, state, Population)
# Convert categorical variables to factors
df_model$brand <- as.factor(df_model$brand)
df_model$state <- as.factor(df_model$state)
# Split data
set.seed(4567)
train_index <- sample(nrow(df_model), 0.8 * nrow(df_model))
train_data <- df_model[train_index, ]
test_data <- df_model[-train_index, ]
#random forest model
rf_model <- randomForest(
price ~ .,
data = train_data,
ntree = 100,
mtry = 3,
importance = TRUE
)
print(rf_model)
importance_values <- importance(rf_model)
print(importance_values)
# Plot variable importance
varImpPlot(rf_model, main = "Variable Importance for Car Price Prediction")
# Evaluate model on test data
predictions <- predict(rf_model, test_data)
rmse <- sqrt(mean((predictions - test_data$price)^2))
r2 <- 1 - sum((test_data$price - predictions)^2) / sum((test_data$price - mean(test_data$price))^2)
mae <- mean(abs(predictions - test_data$price))
cat("Test RMSE:", rmse, "\n")
cat("Test R-squared:", r2, "\n")
cat("Test MAE:", mae, "\n")
```
```{r}
library(rpart)
library(rpart.plot)
tree_model <- rpart(price ~ ., data = train_data, cp = 0.05)
rpart.plot(tree_model, box.palette = "RdBu", nn = TRUE,
main = "Decision Tree for Car Price Prediction", digits = 5)
# Plot: actual and predicted values
plot(test_data$price, predictions,
main = "Actual vs Predicted Car Prices",
xlab = "Actual Price", ylab = "Predicted Price")
abline(0, 1, col = "red")
```
# Multiple linearregression
```{r}
df_lm_simple <- df %>%
select(price, year, mileage, Population)
linear_model <- lm(price ~ ., data = df_lm_simple)
summary(linear_model)
```
```{r}
predicted <- predict(linear_model, newdata = df_lm_simple)
actual <- df_lm_simple$price
rmse <- sqrt(mean((actual - predicted)^2))
mae <- mean(abs(actual - predicted))
r_squared <- summary(linear_model)$r.squared
cat("RMSE:", rmse, "\n")
cat("MAE:", mae, "\n")
cat("R-squared:", r_squared, "\n")
```
```{r}
df_cart <- df %>%
select(price, year, mileage, Population)
#CART MODEL
cart_model <- rpart(price ~ ., data = df_cart, method = "anova")
rpart.plot(cart_model, main = "CART: Predicting Car Price", extra = 101)
```
```{r}
# Predictions
pred_cart <- predict(cart_model, newdata = df_cart)
# Calculate RMSE and MAE
rmse_cart <- sqrt(mean((df_cart$price - pred_cart)^2))
mae_cart <- mean(abs(df_cart$price - pred_cart))
r_squared_cart <- 1 - sum((df_cart$price - pred_cart)^2) / sum((df_cart$price - mean(df_cart$price))^2)
cat("CART RMSE:", rmse_cart, "\n")
cat("CART MAE:", mae_cart, "\n")
cat("CART R-squared:", r_squared_cart, "\n")
```
```{r}
df_nn <- df %>%
select(price, year, mileage, Population)
scaled_data <- as.data.frame(scale(df_nn))
X <- scaled_data[, c("year", "mileage", "Population")]
Y <- scaled_data$price
# neural network with 1 hidden layer with 3 neurons
set.seed(123)
nn_model <- nnet(X, Y, size = 3, linout = TRUE)
# Predict
pred_nn <- predict(nn_model, X)
# performance
rmse_nn <- sqrt(mean((Y - pred_nn)^2))
mae_nn <- mean(abs(Y - pred_nn))
r_squared_nn <- 1 - sum((Y - pred_nn)^2) / sum((Y - mean(Y))^2)
cat("Neural Net RMSE:", rmse_nn, "\n")
cat("Neural Net MAE:", mae_nn, "\n")
cat("Neural Net R-squared:", r_squared_nn, "\n")
```
```{r}
mean_price <- mean(df_lm_simple$price)
sd_price <- sd(df_lm_simple$price)
real_rmse <- 0.8569626 * sd_price
real_mae <- 0.6371012 * sd_price
cat("Unscaled RMSE (in $):", round(real_rmse, 2), "\n")
cat("Unscaled MAE (in $):", round(real_mae, 2), "\n")
```
```{r}
df_xgb <- df %>%
select(price, year, mileage, Population)
X <- as.matrix(df_xgb[, c("year", "mileage", "Population")])
y <- df_xgb$price
# dmatrix format for xgboost
dtrain <- xgb.DMatrix(data = X, label = y)
```
```{r}
# set param
params <- list(
objective = "reg:squarederror", # regression
eval_metric = "rmse"
)
# Training model
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 100,
verbose = 0
)
```
```{r}
pred_xgb <- predict(xgb_model, newdata = dtrain)
# Calculate metrics
rmse_xgb <- sqrt(mean((y - pred_xgb)^2))
mae_xgb <- mean(abs(y - pred_xgb))
r_squared_xgb <- 1 - sum((y - pred_xgb)^2) / sum((y - mean(y))^2)
cat("XGBoost RMSE:", round(rmse_xgb, 2), "\n")
cat("XGBoost MAE:", round(mae_xgb, 2), "\n")
cat("XGBoost R-squared:", round(r_squared_xgb, 4), "\n")
```
```{r}
# Create a data frame for plotting
xgb_plot_df <- data.frame(
actual = y,
predicted = pred_xgb
)
# Plot
ggplot(xgb_plot_df, aes(x = actual, y = predicted)) +
geom_point(alpha = 0.4, color = "blue") +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(
title = "XGBoost: Actual vs Predicted Car Prices",
x = "Actual Price ($)",
y = "Predicted Price ($)"
) +
theme_minimal()
```
```{r}
# Feature importance
importance_matrix <- xgb.importance(model = xgb_model)
# Plot
xgb.plot.importance(importance_matrix,
top_n = 10,
measure = "Gain",
rel_to_first = TRUE,
xlab = "Relative Importance")
```