#Download Total.csv from FYP drive for verification.
#Random Forest Regression!
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
# Train a multi-output random forest on Total.csv and report MAE / RMSE on a
# held-out 20% test split.

# Read in data
features = pd.read_csv('Total.csv')

# Names of the two target columns that are predicted jointly.
# NOTE(review): these names look like data values rather than headers, which
# suggests Total.csv may have no header row and pandas is consuming the first
# sample as column names -- confirm; if so, load with header=None and select
# the target columns by position instead.
target_columns = ['122.57', '58.825']

# Labels are the values we want to predict (one 1-D array per target)
labels1 = np.array(features[target_columns[0]])
labels2 = np.array(features[target_columns[1]])
# Stack the targets as rows, then transpose so each row is one sample and
# each column is one target, which is the layout fit()/predict() work with.
labels3 = np.vstack([labels1, labels2])
label4 = np.transpose(labels3)

# Remove both label columns from the features in a single call
# (axis 1 refers to the columns)
features = features.drop(target_columns, axis=1)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

# Using scikit-learn to split the data into training and testing sets;
# the fixed random_state keeps the split reproducible between runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, label4, test_size=0.2, random_state=42)

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators=1000, random_state=42)

# Train the model on training data
rf.fit(train_features, train_labels)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Per-sample, per-target absolute errors (kept for inspection; the summary
# metrics below are computed by scikit-learn directly)
errors = abs(predictions - test_labels)

# Summary metrics. scikit-learn's signature is (y_true, y_pred), so the true
# labels go first. With the default multioutput setting both metrics are
# averaged uniformly over the two targets.
errs = metrics.mean_absolute_error(test_labels, predictions)
errs1 = metrics.mean_squared_error(test_labels, predictions)

# Print the mean absolute error, then the root mean squared error
print(errs)
print(sqrt(errs1))
# Recorded output from earlier runs (pasted results, kept for reference):
# 3 features:
# 7 features:
# ('Mean Absolute Error:', 3.555187104277004)
# ('Mean Squared Error:', 85.12462334840987)
# ('Root Mean Squared Error:', 9.2263006318030776)
# Variance score: 0.88