from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=7)

# First four columns are the features, the fifth is the class label.
train_arr = train.values
test_arr = test.values
X = train_arr[:, :4]
Y = train_arr[:, 4]
x_test = test_arr[:, :4]
y_test = test_arr[:, 4]
from sklearn import linear_model, neighbors
from sklearn.metrics import accuracy_score

# Fit each candidate model on the training split and report hold-out accuracy.
# The label strings are the exact print prefixes used for each model's score.
evaluations = (
    ("Accuracy Score of Logistic Regression: ",
     linear_model.LogisticRegression(solver='liblinear', multi_class='ovr')),
    ("Accuracy Score of KNN: ",
     neighbors.KNeighborsClassifier()),
)
for label, classifier in evaluations:
    classifier.fit(X, Y)
    predictions = classifier.predict(x_test)
    print(label, accuracy_score(y_test, predictions))
# Observed output (pasted from a previous run; commented out so the file parses):
# Accuracy Score of Logistic Regression:  0.8
# Accuracy Score of KNN:  0.9
# 10-fold cross-validation: randomly split the training set into 10 folds,
# train on 9 and score on the held-out fold each time, so the model
# comparison does not hinge on a single train/test split.
from sklearn import model_selection

models = {}
models['LR'] = linear_model.LogisticRegression(solver='liblinear', multi_class='ovr')
models['KNN'] = neighbors.KNeighborsClassifier()

results = []   # per-model arrays of 10 fold accuracies
names = []     # model labels, parallel to `results`
score = 'accuracy'

# BUG FIX: `random_state` is only honoured when shuffle=True — recent
# scikit-learn raises ValueError for KFold(random_state=...) without it,
# and shuffling is what the comment above promises. The splitter does not
# depend on the model, so build it once outside the loop.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)

for name, model in models.items():
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=score)
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy and its spread across the 10 folds.
    print('{}: {} ({})'.format(name, cv_results.mean(), cv_results.std()))