---
title: "Project 520"
author: "Anup Rijal"
date: "`r Sys.Date()`"
output: pdf_document
---


```{r, message=FALSE, warning=FALSE}
library(readr)
library(tidyverse)
library(nnet)
library(xgboost)
library(Matrix)
library(ggplot2)
library(randomForest)

# Paths are machine-specific
car_dataset <- read.csv("/Users/doucuments/Desktop/STAT 520 project /USA_cars_datasets.csv")
state_dataset <- read.delim("/Users/doucuments/Desktop/STAT 520 project /state-population-estimations.tsv", header = TRUE)
```


```{r}
head(car_dataset)
head(state_dataset)
```

```{r}
# Reshape the population table to long format and join it onto the car data
population_data <- state_dataset %>%
  pivot_longer(
    cols = starts_with("X"),
    names_to = "Year",
    values_to = "Population"
  ) %>%
  mutate(
    Year = as.integer(str_remove(Year, "X")),
    Population = as.numeric(str_replace_all(Population, ",", ""))
  )

car_dataset <- car_dataset %>%
  mutate(state = str_to_title(state))

car_dataset_with_population <- car_dataset %>%
  left_join(
    population_data,
    by = c("state" = "State", "year" = "Year")
  ) %>%
  na.omit()

print(head(car_dataset_with_population))
cat("Original dataset rows:", nrow(car_dataset), "\n")
cat("Clean dataset rows:", nrow(car_dataset_with_population), "\n")
```
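
Rows are dropped when a state/year combination has no match in the population table (or when any other field is missing). A quick check, sketched here with the objects defined above, lists the unmatched combinations before they are removed:

```{r}
# Show which state/year combinations fail to join (these rows are removed by na.omit)
car_dataset %>%
  anti_join(population_data, by = c("state" = "State", "year" = "Year")) %>%
  count(state, year, sort = TRUE) %>%
  head(10)
```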

```{r}
num_columns <- ncol(car_dataset_with_population)
num_row <- nrow(car_dataset_with_population)

print(num_columns)
print(num_row)
```


```{r}
ggplot(car_dataset_with_population, aes(x = state, y = price)) +
  geom_boxplot() +
  labs(title = "Boxplot of Car Prices by State",
       x = "State",
       y = "Car Price") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
```

```{r}
ggplot(car_dataset_with_population, aes(x = state, fill = brand)) +
  geom_bar(position = "stack") +
  labs(title = "Car Brands Distribution by State",
       x = "State",
       y = "Number of Cars") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
```

```{r}
ggplot(car_dataset_with_population, aes(x = mileage, y = price)) +
  geom_point() +
  labs(title = "Car Price vs Mileage",
       x = "Mileage",
       y = "Car Price") +
  theme_minimal()
```

```{r}
ggplot(car_dataset_with_population, aes(x = price)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Car Prices",
       x = "Car Price",
       y = "Frequency") +
  theme_minimal()
```

```{r}
head(car_dataset_with_population)
```

```{r}
# Drop columns that will not be used for modeling
car_dataset_with_population <- car_dataset_with_population |>
  select(-condition, -vin, -lot, -country)
```


```{r}
head(car_dataset_with_population)
```

```{r}
ggplot(car_dataset_with_population, aes(x = Population, y = price)) +
  geom_point() +
  labs(title = "Car Price vs Population", x = "Population", y = "Price") +
  theme_minimal()
```


```{r}
df <- na.omit(car_dataset_with_population)
df_model <- df %>%
  select(price, brand, year, mileage, state, Population)

# Convert categorical variables to factors
df_model$brand <- as.factor(df_model$brand)
df_model$state <- as.factor(df_model$state)

# Split data into 80% training and 20% test sets
set.seed(4567)
train_index <- sample(nrow(df_model), 0.8 * nrow(df_model))
train_data <- df_model[train_index, ]
test_data <- df_model[-train_index, ]

# Random forest model
rf_model <- randomForest(
  price ~ .,
  data = train_data,
  ntree = 100,
  mtry = 3,
  importance = TRUE
)

print(rf_model)
importance_values <- importance(rf_model)
print(importance_values)

# Plot variable importance
varImpPlot(rf_model, main = "Variable Importance for Car Price Prediction")

# Evaluate model on test data
predictions <- predict(rf_model, test_data)
rmse <- sqrt(mean((predictions - test_data$price)^2))
r2 <- 1 - sum((test_data$price - predictions)^2) / sum((test_data$price - mean(test_data$price))^2)
mae <- mean(abs(predictions - test_data$price))

cat("Test RMSE:", rmse, "\n")
cat("Test R-squared:", r2, "\n")
cat("Test MAE:", mae, "\n")
```


```{r}
library(rpart)
library(rpart.plot)

# Single decision tree fit to the training data (for interpretability)
tree_model <- rpart(price ~ ., data = train_data, cp = 0.05)
rpart.plot(tree_model, box.palette = "RdBu", nn = TRUE,
           main = "Decision Tree for Car Price Prediction", digits = 5)

# Plot actual vs predicted values from the random forest on the test set
plot(test_data$price, predictions,
     main = "Actual vs Predicted Car Prices",
     xlab = "Actual Price", ylab = "Predicted Price")
abline(0, 1, col = "red")
```

# Multiple linear regression


```{r}
df_lm_simple <- df %>%
  select(price, year, mileage, Population)

linear_model <- lm(price ~ ., data = df_lm_simple)
summary(linear_model)
```


```{r}
# In-sample fit metrics for the linear model
predicted <- predict(linear_model, newdata = df_lm_simple)
actual <- df_lm_simple$price

rmse <- sqrt(mean((actual - predicted)^2))
mae <- mean(abs(actual - predicted))
r_squared <- summary(linear_model)$r.squared

cat("RMSE:", rmse, "\n")
cat("MAE:", mae, "\n")
cat("R-squared:", r_squared, "\n")
```


```{r}
df_cart <- df %>%
  select(price, year, mileage, Population)

# CART model
cart_model <- rpart(price ~ ., data = df_cart, method = "anova")
rpart.plot(cart_model, main = "CART: Predicting Car Price", extra = 101)
```

```{r}
# Predictions (in-sample)
pred_cart <- predict(cart_model, newdata = df_cart)

# Calculate RMSE, MAE, and R-squared
rmse_cart <- sqrt(mean((df_cart$price - pred_cart)^2))
mae_cart <- mean(abs(df_cart$price - pred_cart))
r_squared_cart <- 1 - sum((df_cart$price - pred_cart)^2) / sum((df_cart$price - mean(df_cart$price))^2)

cat("CART RMSE:", rmse_cart, "\n")
cat("CART MAE:", mae_cart, "\n")
cat("CART R-squared:", r_squared_cart, "\n")
```


```{r}
df_nn <- df %>%
  select(price, year, mileage, Population)

# Standardize all variables; the metrics below are therefore on the standardized scale
scaled_data <- as.data.frame(scale(df_nn))

X <- scaled_data[, c("year", "mileage", "Population")]
Y <- scaled_data$price

# Neural network with one hidden layer of 3 neurons
set.seed(123)
nn_model <- nnet(X, Y, size = 3, linout = TRUE)

# Predict on the training data
pred_nn <- predict(nn_model, X)

# Performance (standardized units)
rmse_nn <- sqrt(mean((Y - pred_nn)^2))
mae_nn <- mean(abs(Y - pred_nn))
r_squared_nn <- 1 - sum((Y - pred_nn)^2) / sum((Y - mean(Y))^2)

cat("Neural Net RMSE:", rmse_nn, "\n")
cat("Neural Net MAE:", mae_nn, "\n")
cat("Neural Net R-squared:", r_squared_nn, "\n")
```

```{r}
# Convert the neural net errors back to dollars:
# the response was standardized, so multiplying by sd(price) undoes the scaling
sd_price <- sd(df_lm_simple$price)

real_rmse <- rmse_nn * sd_price
real_mae  <- mae_nn * sd_price

cat("Unscaled RMSE (in $):", round(real_rmse, 2), "\n")
cat("Unscaled MAE (in $):", round(real_mae, 2), "\n")
```
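
For a plot comparable to the other models, the standardized predictions can also be mapped back to dollars. This is a sketch using the objects above: `scale()` centered and scaled each column by its mean and standard deviation, so the inverse is `pred * sd + mean`.

```{r}
# Back-transform the standardized predictions to the dollar scale
mean_price <- mean(df_nn$price)
sd_price_nn <- sd(df_nn$price)
pred_nn_dollars <- as.vector(pred_nn) * sd_price_nn + mean_price

# Actual vs predicted car prices for the neural network
plot(df_nn$price, pred_nn_dollars,
     main = "Neural Net: Actual vs Predicted Car Prices",
     xlab = "Actual Price ($)", ylab = "Predicted Price ($)")
abline(0, 1, col = "red")
```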


```{r}
df_xgb <- df %>%
  select(price, year, mileage, Population)

X <- as.matrix(df_xgb[, c("year", "mileage", "Population")])
y <- df_xgb$price

# DMatrix format for xgboost
dtrain <- xgb.DMatrix(data = X, label = y)
```


```{r}
# Set parameters
params <- list(
  objective = "reg:squarederror",  # squared-error regression
  eval_metric = "rmse"
)

# Train the model
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,
  verbose = 0
)
```


```{r}
# Predictions on the training data (in-sample)
pred_xgb <- predict(xgb_model, newdata = dtrain)

# Calculate metrics
rmse_xgb <- sqrt(mean((y - pred_xgb)^2))
mae_xgb <- mean(abs(y - pred_xgb))
r_squared_xgb <- 1 - sum((y - pred_xgb)^2) / sum((y - mean(y))^2)

cat("XGBoost RMSE:", round(rmse_xgb, 2), "\n")
cat("XGBoost MAE:", round(mae_xgb, 2), "\n")
cat("XGBoost R-squared:", round(r_squared_xgb, 4), "\n")
```
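
These metrics are computed on the same rows the booster was trained on, so they are in-sample and will look optimistic next to the random forest's test-set numbers. A held-out comparison could reuse the 80/20 `train_index` split from the random forest chunk, since `df_xgb` and `df_model` come row-for-row from the same `df`. The sketch below introduces the names `dtrain_split`, `dtest_split`, and `xgb_split` for illustration:

```{r}
# Held-out evaluation sketch: reuse the 80/20 split created for the random forest
X_train <- X[train_index, ];  y_train <- y[train_index]
X_test  <- X[-train_index, ]; y_test  <- y[-train_index]

dtrain_split <- xgb.DMatrix(data = X_train, label = y_train)
dtest_split  <- xgb.DMatrix(data = X_test,  label = y_test)

xgb_split <- xgb.train(params = params, data = dtrain_split, nrounds = 100, verbose = 0)

pred_test <- predict(xgb_split, newdata = dtest_split)
cat("Held-out RMSE:", round(sqrt(mean((y_test - pred_test)^2)), 2), "\n")
```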

```{r}
# Create a data frame for plotting
xgb_plot_df <- data.frame(
  actual = y,
  predicted = pred_xgb
)

# Plot actual vs predicted prices
ggplot(xgb_plot_df, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.4, color = "blue") +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "XGBoost: Actual vs Predicted Car Prices",
    x = "Actual Price ($)",
    y = "Predicted Price ($)"
  ) +
  theme_minimal()
```

```{r}
# Feature importance
importance_matrix <- xgb.importance(model = xgb_model)

# Plot the top features by gain
xgb.plot.importance(importance_matrix,
                    top_n = 10,
                    measure = "Gain",
                    rel_to_first = TRUE,
                    xlab = "Relative Importance")
```