モデル
scikit-learn
★DataSet取得
ロード
# Load the bundled iris data set and expose features/labels as X / y.
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data    # feature matrix
y = iris.target  # class labels
作成
# Generate a small synthetic binary-classification data set.
# FIX: sklearn.datasets.samples_generator was removed in scikit-learn 0.22;
# make_classification is importable directly from sklearn.datasets.
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=6, n_features=5, n_informative=2,
                           n_redundant=2, n_classes=2, n_clusters_per_class=2,
                           scale=1.0, random_state=20)
★前処理
scikit-learn
# Standardize features: fit the scaler on the training data only, then apply
# the same transform to both train and test (avoids test-set leakage).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() # computes per-feature mean and standard deviation
scaler.fit(trainX)  # NOTE(review): trainX / testX are assumed defined earlier — confirm
trainX2 = scaler.transform(trainX)
testX2 = scaler.transform(testX)
# trainX2 = scaler.fit_transform(trainX)  # one-call equivalent of fit + transform
Numpy自作
def norm(x):
    """Standardize *x*: subtract the mean and divide by the population std.

    Equivalent to a 1-D StandardScaler: the result has zero mean and
    unit variance (uses np.std, i.e. ddof=0).
    """
    arr = np.asarray(x, dtype=float)
    return (arr - arr.mean()) / arr.std()
フィルター方法
# Filter-style feature selection: drop features whose variance is below the threshold.
from sklearn.feature_selection import VarianceThreshold
var = VarianceThreshold(threshold=1.0)
data = var.fit_transform(iris.data)  # keeps only the high-variance columns
LDA(Linear Discriminant Analysis) vs PCA(Principal Component Analysis)
# LDA (supervised) vs PCA (unsupervised) dimensionality reduction.
# FIX: the sklearn.lda module was removed; LinearDiscriminantAnalysis lives in
# sklearn.discriminant_analysis. Its n_components must be an int
# <= min(n_classes - 1, n_features) — a float like 0.9 raises an error.
# (PCA is different: a float in (0, 1) means "keep this fraction of variance".)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=2)  # iris has 3 classes -> at most 2 components
data = lda.fit_transform(iris.data, iris.target)
from sklearn.decomposition import PCA
pca = PCA(n_components=0.9)  # keep enough components for 90% explained variance
data = pca.fit_transform(iris.data)
★DataSet分割
# Split the data 70/30 into train and test sets, reproducibly.
# FIX: the module is sklearn.model_selection — "mode_selection" is a typo
# that raises ModuleNotFoundError.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
★モデル
Parameter全般
途中経過 verbose=1(defaultは0)
最大回数 max_iter(defaultは-1(永久))
並列thread数 n_jobs=-1(最大)
その他 tol、random_stateなど
Method全般
fit(X_train,y_train) 学習を行う
predict(X_test) 予測する
predict_proba(X_test) 確率を返す
get_params() パラメータを取得する
score(X_test,y_test) 予測スコアを返す
Supervised Machine Learning(教師あり学習)の例
# Support Vector Machine
# Common supervised estimators and where they live in scikit-learn.
# Support Vector Machine
from sklearn.svm import SVC
# K Nearest Neighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)  # majority vote among 3 nearest neighbors
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Gradient Boosting Decision Tree
from sklearn.ensemble import GradientBoostingClassifier
Unsupervised Machine Learning(教師なし学習)の例
# Common unsupervised estimators in scikit-learn.
from sklearn.cluster import KMeans
model = KMeans(n_clusters=2)
# Gaussian Mixture Model
# FIX: sklearn.mixture.GMM was removed in scikit-learn 0.20;
# GaussianMixture is the replacement (same n_components argument).
from sklearn.mixture import GaussianMixture
model = GaussianMixture(n_components=2)
# Restricted Boltzmann Machine
from sklearn.neural_network import BernoulliRBM
model = BernoulliRBM(n_components=2)
★精度向上
数据优化(Data Optimization)
平衡数据集(Balance your data set)
二次采样多数类(Subsample Majority Class)
过度采样少数类(Oversample Minority Class)
More Data + Bigger Models(NN etc.)
算法调整(Algorithm tuning)
算法抽查(Algorithm spot check)
超参数优化(Hyper-Parameter Optimization)
学习速率(Learning rates)
批(batch)的大小和期(epoch)的数量
提前停止(Early Stopping)
正则化(Regularization)
合并(Ensembles)
合并模型 / 视图(Bagging)
堆积(Stacking)
★モデルの保存と読み込み
方法1
# Method 1: joblib — efficient for models that hold large numpy arrays.
# FIX: sklearn.externals.joblib was removed in scikit-learn 0.23;
# import the standalone joblib package instead.
import joblib
joblib.dump(model, 'model.pkl')
model_new = joblib.load('model.pkl')
方法2
# Method 2: plain pickle — write the model out, then read it back.
# NOTE: only unpickle files you trust; pickle executes arbitrary code on load.
import pickle
with open('model.pkl', 'wb') as fh:
    pickle.dump(model, fh)
with open('model.pkl', 'rb') as fh:
    model = pickle.load(fh)
★評価関数
Regression
MAE(Mean Absolute Error) / l1-norm loss
MSE(Mean Squared Error) / l2-norm loss
RMSE(Root Mean Squared Error)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)  # MSE; take the square root for RMSE
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)  # coefficient of determination R^2
Classification
Confusion Matrix
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
TNR = TN /(FP + TN) = 1 - FPR
Precision、Recall、F1
例:ある池に鯉が1400、エビが300、亀が300いて、鯉を捕まえるのが目的で
>鯉700、エビ200、亀100を捕まえた場合
Precision = 700 / (700 + 200 + 100) = 70%
Recall = 700 / 1400 = 50%
F1 = 70% * 50% * 2 / (70% + 50%) = 58.3%
>全部捕まえた場合
Precision = 1400 / (1400 + 300 + 300) = 70%
Recall = 1400 / 1400 = 100%
F1 = 70% * 100% * 2 / (70% + 100%) = 82.35%
# Summarize classifier performance on hard label predictions.
from sklearn.metrics import confusion_matrix
print( confusion_matrix(y_true=y_test, y_pred=y_pred) )
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=y_pred))  # per-class precision/recall/F1
ROC(Receiver Operating Characteristic)とAUC(Area Under Curve)
※ROCは左上の方が良い、1 > AUC > 0.5は目標
# Plot the ROC curve and report the AUC.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# NOTE(review): for a meaningful curve y_pred should be a probability/score,
# not a hard 0/1 label — confirm at the call site.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)  # area under the ROC curve
plt.plot(fpr, tpr, lw=1, label='ROC(area = %0.2f)' % (roc_auc))
plt.xlabel("FPR (False Positive Rate)")
plt.ylabel("TPR (True Positive Rate)")
plt.title("Receiver Operating Characteristic, ROC(AUC = %0.2f)"% (roc_auc))
plt.show()
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)  # AUC computed directly from labels and scores
LinearRegression
# End-to-end linear-regression example: load CSVs, one-hot encode features,
# fit the model, and write a submission file.
from sklearn.linear_model import LinearRegression as LR
train = pd.read_csv('train.csv')  # NOTE(review): assumes pandas is imported as pd — confirm
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample.csv', header=None)  # submission template
trainX = pd.get_dummies(train[['week', 'temp']])  # one-hot encode the selected columns
testX = pd.get_dummies(test[['week', 'temp']])
y = train['y']
# Only needed for simple (single-feature) regression
# trainX = trainX.values.reshape(-1, 1)
# testX = testX.values.reshape(-1, 1)
model = LR()
model.fit(trainX, y)
# np.set_printoptions(precision=3, suppress=True)
model.coef_  # fitted coefficients
model.intercept_  # fitted intercept
model.score(trainX, y)  # R^2 on the training data
pred = model.predict(testX)
sample[1] = pred  # second column of the template holds the predictions
sample.to_csv('submit.csv', index=None, header=None)
DecisionTreeClassifier
# Decision-tree classification example with graphviz visualization of the tree.
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image
train = pd.read_csv('train.csv')  # NOTE(review): assumes pandas is imported as pd — confirm
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample.csv', header=None)  # submission template
trainX = train.iloc[:, :-1]  # all columns except the last (target)
testX = test.copy()
y = train['y']
trainX = pd.get_dummies(trainX)  # one-hot encode categorical features
testX = pd.get_dummies(testX)
from sklearn.model_selection import cross_validate
model = DT(max_depth=4, min_samples_leaf=500)  # cap depth/leaf size to curb overfitting
# cross_validate(model, trainX, y, cv=5, scoring='roc_auc', n_jobs=-1)
model.fit(trainX, y)
# Render the fitted tree to a PNG via a .dot intermediate file.
export_graphviz(model, out_file='tree.dot', feature_names=trainX.columns, class_names=['0', '1'], filled=True, rounded=True)
g = pydotplus.graph_from_dot_file(path='tree.dot')
Image(g.create_png())
pred = model.predict_proba(testX)
pred = pred[:, 1]  # probability of the positive class
sample[1] = pred
sample.to_csv('submit.csv', index=None, header=None)
★評価指標(Evaluation metrics)
・回帰タスク:RMSE、MAE、決定係数R2
・二値分類
・0か1の2種類のラベル:混同行列、正答率、誤答率、適合率、再現率、F1-score、MCC
・0から1の間の確率:logloss、AUC
・多クラス分類
・マルチクラス分類:multi-class accuracy/logloss
・マルチラベル分類:mean-F1、macro-F1、micro-F1
・レコメンデーション
・順位を付ける場合:MAP@K
・順位を付けない場合:マルチラベル分類と同じ
GridSearchCV
# Hyper-parameter tuning of a decision tree via cross-validated grid search.
from sklearn.model_selection import GridSearchCV
model = DT()
parameters = {
    'max_depth': list(range(2, 11)),
    'min_samples_leaf': [5, 10, 20, 50]
}
# FIX: y must NOT be passed to the GridSearchCV constructor — the third
# positional slot is `scoring`, so passing y there breaks the call; targets
# go to fit(). return_train_score=True is required for cv_results_ to
# contain 'mean_train_score'.
gcv = GridSearchCV(model, parameters, cv=5, scoring='roc_auc', n_jobs=-1,
                   return_train_score=True)
gcv.fit(trainX, y)
train_score = gcv.cv_results_['mean_train_score']
test_score = gcv.cv_results_['mean_test_score']
plt.plot(train_score)
plt.plot(test_score)
# Relabel x ticks from grid index (0..9) to the max_depth values (2..10).
plt.xticks(list(range(0, 10)), list(range(2, 11)))
gcv.best_params_  # best hyper-parameter combination found
pred = gcv.predict_proba(testX)  # refit-on-best-params estimator
pred = pred[:, 1]  # probability of the positive class
sample[1] = pred
sample.to_csv('submit.csv', index=None, header=None)
★各種関数
学習中に最適化されるのが目的関数
学習後に良さを確認するための指標が評価関数
目的関数 objective function
コスト関数 cost function
誤差関数 error function
損失関数 loss function
損失関数 + 正則化項(過学習を防ぐ) = コスト関数
損失関数 = 誤差関数
目的関数 ⊃ コスト関数、誤差関数、損失関数