import xgboost
import numpy as np
# Example data: random binary labels y and three random features X
y = np.random.randint(0,2,1000)
X = np.random.rand(y.size, 3)
X_pred = np.random.rand(10, 3) # unlabeled rows to score with the final model
# Wrap the training and scoring data in XGBoost's own DMatrix format
feature_names = [f'col{i}' for i in range(X.shape[1])]
xgtrain = xgboost.DMatrix(X, y, feature_names = feature_names)
xgtest = xgboost.DMatrix(data = X_pred, feature_names = feature_names)
# Evaluation metric: auc, map, map@n, aucpr, ndcg, ndcg@n
# The corresponding cross-validation column is 'test-<metric>-mean', e.g. 'test-auc-mean'
eval_metric = 'auc'
cv_metric_name = 'test-auc-mean'
##### booster parameters #####
# learning_rate (eta): the shrinkage step size. When a new tree f(x) is added, only a small
# fraction of it is applied: F_m(x) = F_{m-1}(x) + eta * f_m(x). Not fully optimizing at each
# round leaves room for future rounds and helps reduce overfitting. The smaller the learning
# rate, the more conservative the training. 0.01 ~ 0.2 is usually a good range.
# min_child_weight: the minimum sum of instance weights (hessian) required in a child node. Default 1.
# Used to control overfitting: higher values keep the model from learning relations that are
# highly specific to the sample drawn for a single tree.
# gamma: the minimum loss reduction required to make a further split on a leaf node. Must be >= 0.
# Reasonable values depend on the loss function and should be tuned.
# max_depth: the maximum depth of each tree.
# Small trees are preferred because complex trees with many leaves tend to overfit the training data;
# hundreds of small trees usually beat dozens of large ones. Typical range 3 ~ 10, often best at 3 ~ 6.
# max_leaves: similar to max_depth, but caps the number of leaves per tree instead of the depth.
# Usually unnecessary when max_depth is set.
# subsample: the fraction of training rows randomly sampled to build each tree.
# colsample_bytree: similar to subsample, but randomly samples a fraction of the features (columns) for each tree.
# colsample_bylevel: the fraction of features sampled at each depth level of a tree.
# No need to use this if subsample and colsample_bytree are already set.
# lambda: the weight of the L2 regularization term on leaf weights.
# alpha: the weight of the L1 regularization term on leaf weights.
# objective: "reg:squarederror" --squared-error regression (formerly "reg:linear")
# "reg:logistic" --logistic regression
# "binary:logistic" --logistic regression for binary classification, output probability
'''Note: max_depth and min_child_weight usually have the highest impact on the model outcome; tune those two first, then the rest.'''
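# For reference (a summary of the split-gain formula from the XGBoost docs, added as a reading
# aid and not part of the original notes): several of the parameters above interact through
#     Gain = 1/2 * [ G_L^2/(H_L + lambda) + G_R^2/(H_R + lambda)
#                    - (G_L + G_R)^2/(H_L + H_R + lambda) ] - gamma
# where G_* / H_* are the sums of gradients / hessians in the left and right child.
# A candidate split is kept only if its Gain is positive, so larger gamma or lambda demands a
# larger loss reduction, and min_child_weight requires H_L and H_R to each reach that threshold.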
parameters = {
    'booster': 'gbtree',             # gbtree, gblinear, dart
    'objective': 'binary:logistic',  # log loss as the objective function
    'eval_metric': eval_metric,      # auc
    'base_score': 0.5,               # initial prediction score (global bias)
    'eta': 0.1,                      # start at 0.1, a relatively high value; try 0.01 later
    'gamma': 0,                      # minimum loss reduction required for a leaf split
    'max_depth': 5,                  # start at 5, may try 3 ~ 10
    'min_child_weight': 1,           # default 1
    'max_delta_step': 0,
    'subsample': 0.8,                # start with an 80% row sampling rate
    'colsample_bytree': 0.8,         # start with an 80% column sampling rate
    'colsample_bylevel': 1,
    'lambda': 1,
    'alpha': 0.1,
    'scale_pos_weight': 1,           # ratio #negative/#positive, default 1
    'nthread': 4,                    # 4-core CPU
    'seed': 88,
    'verbosity': 0                   # silence log output (replaces the deprecated 'silent')
}
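# Optional sanity check, not part of the original tuning flow: a single cross-validated run with
# the starting parameters gives a baseline score to compare the grid-search results against.
# It uses the same xgboost.cv call as the tuning loops below; baseline_cv is just a local name.
baseline_cv = xgboost.cv(
    parameters,
    xgtrain,
    num_boost_round = 100,
    nfold = 5,
    early_stopping_rounds = 20
)
print('baseline {0}: {1}'.format(eval_metric, baseline_cv[cv_metric_name].iloc[-1]))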
##### rounds of parameter tuning #####
'''Grid search with the built-in xgboost.cv, so each parameter combination is cross-validated.'''
best_score = 0
for eta in [0.05, 0.04, 0.03, 0.02]:
    for max_depth in [4, 5, 6]:
        for min_child_weight in [3, 2, 1.5, 1]:
            parameters['eta'] = eta
            parameters['max_depth'] = max_depth
            parameters['min_child_weight'] = min_child_weight
            cv_result = xgboost.cv(        # run cross-validation with the current parameters
                parameters,
                xgtrain,
                num_boost_round = 100,     # upper bound; early stopping usually ends sooner
                nfold = 5,                 # 5-fold cross-validation
                early_stopping_rounds = 20 # stop early if no improvement for 20 rounds
            )
            # keep the best score as the grid search proceeds
            score = cv_result[cv_metric_name].iloc[-1]
            if score > best_score:
                best_score = score
                best_eta = eta
                best_max_depth = max_depth
                best_min_child_weight = min_child_weight
                best_boosting_rounds = cv_result.shape[0]  # best number of boosting rounds (trees)
            print('====eta:{0}, max depth:{1}, min child weight:{2} --- best rounds: {3} score: {4}'.format(
                eta, max_depth, min_child_weight, cv_result.shape[0], score))
print('the best {0} is: {1}'.format(eval_metric, best_score))
print(' boosting rounds {0}'.format(best_boosting_rounds))
print(' learning rate {0}'.format(best_eta))
print(' max depth {0}'.format(best_max_depth))
print(' min child weight {0}'.format(best_min_child_weight))
parameters['eta'] = best_eta
parameters['max_depth'] = best_max_depth
parameters['min_child_weight'] = best_min_child_weight
'''grid search for gamma'''
best_score = 0
for gamma in [i/10.0 for i in range(0,6)]:
    parameters['gamma'] = gamma
    cv_result = xgboost.cv(
        parameters,
        xgtrain,
        num_boost_round = 100,
        nfold = 5,
        early_stopping_rounds = 20
    )
    score = cv_result[cv_metric_name].iloc[-1]
    if score > best_score:
        best_score = score
        best_gamma = gamma
        best_boosting_rounds = cv_result.shape[0]
    print('====gamma:{0} --- best rounds: {1} score: {2}'.format(gamma, cv_result.shape[0], score))
print('the best {0} is: {1}'.format(eval_metric, best_score))
print(' boosting rounds {0}'.format(best_boosting_rounds))
print(' gamma {0}'.format(best_gamma))
parameters['gamma'] = best_gamma
'''grid search for subsample and colsample_bytree'''
best_score = 0
for subsample in [0.6,0.7,0.8,0.9]:
    for colsample_bytree in [0.6, 0.7, 0.8, 0.9]:
        # for colsample_bylevel in [0.8, 0.9, 1.0]:
        parameters['subsample'] = subsample
        parameters['colsample_bytree'] = colsample_bytree
        cv_result = xgboost.cv(
            parameters,
            xgtrain,
            num_boost_round = 100,
            nfold = 5,
            early_stopping_rounds = 20
        )
        score = cv_result[cv_metric_name].iloc[-1]
        if score > best_score:
            best_score = score
            best_subsample = subsample
            best_colsample_bytree = colsample_bytree
            best_boosting_rounds = cv_result.shape[0]
        print('====subsample:{0}, colsample_bytree:{1} --- best rounds: {2} score: {3}'.format(
            subsample, colsample_bytree, cv_result.shape[0], score))
print('the best {0} is: {1}'.format(eval_metric, best_score))
print(' boosting rounds {0}'.format(best_boosting_rounds))
print(' subsample {0}'.format(best_subsample))
print(' colsample_bytree {0}'.format(best_colsample_bytree))
parameters['subsample'] = best_subsample
parameters['colsample_bytree'] = best_colsample_bytree
'''grid search for L1 and L2 regularization'''
best_score = 0
for reg_alpha in [0.1, 0.2, 0.5, 1]:
    for reg_lambda in [0.5, 1, 2, 3]:
        parameters['alpha'] = reg_alpha
        parameters['lambda'] = reg_lambda
        cv_result = xgboost.cv(
            parameters,
            xgtrain,
            num_boost_round = 100,
            nfold = 5,
            early_stopping_rounds = 20
        )
        score = cv_result[cv_metric_name].iloc[-1]
        if score > best_score:
            best_score = score
            best_alpha = reg_alpha
            best_lambda = reg_lambda
            best_boosting_rounds = cv_result.shape[0]
        print('====alpha:{0}, lambda:{1} --- best rounds: {2} score: {3}'.format(
            reg_alpha, reg_lambda, cv_result.shape[0], score))
print('the best {0} is: {1}'.format(eval_metric, best_score))
print(' boosting rounds {0}'.format(best_boosting_rounds))
print(' alpha {0}'.format(best_alpha))
print(' lambda {0}'.format(best_lambda))
parameters['alpha'] = best_alpha
parameters['lambda'] = best_lambda
'''refit the model with the best parameters'''
best_booster = xgboost.train(
    params = parameters,
    dtrain = xgtrain,
    num_boost_round = best_boosting_rounds
)
y_pred = best_booster.predict(xgtest)  # predicted probabilities for the positive class
print(y_pred)
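# A possible follow-up, not in the original script: since feature_names were set on the DMatrix,
# the tuned booster can report per-feature importance, and the model can be saved for reuse.
# Both calls below are standard Booster methods; the output path is an arbitrary example.
print(best_booster.get_score(importance_type = 'gain'))  # importance by average gain per split
best_booster.save_model('xgb_tuned.json')                # 'xgb_tuned.json' is a placeholder path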