Implement logistic regression scoring without scikit-learn

Post date: Oct 29, 2016 5:21:36 PM

I built a scoring system using logistic regression on AWS Lambda. However, it is challenging to install the numpy or scikit-learn packages on Lambda (at the time of writing). A good workaround is to train the logistic regression model offline with scikit-learn and ship only the parameters (weights + bias) to the Lambda containers; at the end of this post I sketch how the Lambda side can consume them. Here is what I did offline.

This example is binary classification on the Iris dataset. I keep labels 0 and 1 and discard the rest.

The final score for binary classification with logistic regression is simply

sigmoid(w·x + b) = 1 / (1 + exp(-(w·x + b)))

where w is the weight vector, b is the bias (intercept), and w·x is the dot product of w with the feature vector x. Refer to the logistic section on this url for more details.
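To make the formula concrete, here is a toy calculation with made-up numbers (not from the Iris model): with w = (1, -2), b = 0.5, and input x = (3, 1), the raw score is w·x + b = 3 - 2 + 0.5 = 1.5, and sigmoid(1.5) = 1 / (1 + exp(-1.5)) ≈ 0.82, i.e. roughly an 82% probability of class 1.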

##################################################################
# Modeling
##################################################################  
# Logistic Regression
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from math import exp
# load the iris dataset
dataset = datasets.load_iris()
target = dataset.target[dataset.target < 2]
data = dataset.data[dataset.target < 2]
# fit a logistic regression model to the data
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0,
                           fit_intercept=True, intercept_scaling=1,
                           class_weight=None, random_state=None,
                           solver='liblinear', max_iter=100,
                           multi_class='ovr', verbose=0, warm_start=False,
                           n_jobs=1)
model.fit(data, target)
print(model)
# make predictions
expected = target
predicted = model.predict(data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# Output prediction probabilities; columns are P(class 0), P(class 1)
model.predict_proba(data)
# array([[ 0.98390917,  0.01609083],
#        [ 0.9644754 ,  0.0355246 ],
#        [ 0.97670816,  0.02329184],
#        [ 0.95694433,  0.04305567],
#        [ 0.98549492,  0.01450508],
#        [ 0.98103758,  0.01896242],
#        [ 0.97497879,  0.02502121],
#        [ 0.97593072,  0.02406928],
#        [ 0.95033942,  0.04966058],
#        [ 0.9652527 ,  0.0347473 ],
#        [ 0.98666133,  0.01333867],
#        [ 0.96761024,  0.03238976],
#        [ 0.96647839,  0.03352161],
#        [ 0.97874802,  0.02125198],
#        [ 0.9962299 ,  0.0037701 ],
#        [ 0.99476333,  0.00523667],
#        [ 0.99217185,  0.00782815],
#        [ 0.98223225,  0.01776775],
#        [ 0.98241722,  0.01758278],
#        [ 0.98560983,  0.01439017],
#        [ 0.96824309,  0.03175691],
#        [ 0.98165317,  0.01834683],
#        [ 0.99297853,  0.00702147],
#        ...
# Now I will get the weights + bias from the model
w = model.coef_
c = model.intercept_
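# To ship these parameters to the Lambda container, one option (my own
# sketch, not part of the original workflow) is to serialize them as JSON
# and bundle the file with the deployment package:
import json
with open('logreg_params.json', 'w') as f:
    json.dump({'weights': w.tolist()[0], 'bias': c.tolist()[0]}, f)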
# using numpy
import numpy as np
def sigmoid_np(x, w, c):
    return 1 / (1 + np.exp(-np.inner(w, x) - c))
# This pure-Python version works without numpy, so it is AWS-Lambda-friendly.
def sigmoid(x, w, c):
    return 1.0 / (1 + exp(-sum(xi * wi for xi, wi in zip(x, w)) - c))
# The scores below match the class-1 column of predict_proba above.
for i in range(data.shape[0]):
    print(sigmoid(data[i].tolist(), w.tolist()[0], c[0]))
# 0.016090826738
# 0.0355246004109
# 0.0232918385259
# 0.0430556690477
# 0.0145050823625
# 0.0189624168471
# 0.0250212147231
# 0.0240692765975
# 0.0496605772698
# 0.0347473007216
# 0.0133386701762
# 0.0323897551789
# 0.0335216095562
# 0.021251981282
# 0.00377009533942
# 0.00523667363672
# 0.0078281454093
# 0.0177677502829
# 0.0175827783602
# 0.0143901712337
# 0.0317569070515
# 0.0183468294599
# 0.00702146758676
# ...
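On the Lambda side, the scoring code then only needs the pure-Python sigmoid and the JSON file produced above. Here is a minimal handler sketch; the event shape and the logreg_params.json path are my own assumptions, not from an AWS example:

##################################################################
# Scoring (inside the Lambda container)
##################################################################
import json
from math import exp

# logreg_params.json is the file written offline above (an assumption of
# this sketch; adjust the path to wherever you bundle it).
with open('logreg_params.json') as f:
    params = json.load(f)

def sigmoid(x, w, c):
    return 1.0 / (1 + exp(-sum(xi * wi for xi, wi in zip(x, w)) - c))

def lambda_handler(event, context):
    # expects an event like {"features": [5.1, 3.5, 1.4, 0.2]}
    score = sigmoid(event['features'], params['weights'], params['bias'])
    return {'probability': score}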