import xgboost
import numpy as np
# Example data: random binary labels y and three random features X
y = np.random.randint(0,2,1000)
X = np.random.rand(y.size, 3)
X_pred = np.random.rand(10, 3) # unlabeled rows to score with the final model
# Wrap the training and scoring data in XGBoost's own DMatrix format
feature_names = [f'col{i}' for i in range(X.shape[1])]
xgtrain = xgboost.DMatrix(X, y, feature_names = feature_names)
xgtest = xgboost.DMatrix(data = X_pred, feature_names = feature_names)
# Evaluation metric: auc, map, map@n, aucpr, ndcg, ndcg@n
# The corresponding cross-validation column is 'test-<metric>-mean', e.g. 'test-auc-mean'
eval_metric = 'auc'
cv_metric_name = 'test-auc-mean'
##### booster parameters #####
# learning_rate (eta): the shrinkage step size. When a new tree f(x) is added, only a small
# fraction of it is applied: F_m(x) = F_{m-1}(x) + eta * f_m(x). Not fully optimizing at each
# round leaves room for future rounds and helps reduce overfitting. The smaller the learning
# rate, the more conservative the training. 0.01 ~ 0.2 is usually a good range.
# min_child_weight: the minimum sum of instance weights (hessian) required in a child node. Default 1.
# Used to control overfitting: higher values keep the model from learning relations that are
# highly specific to the sample drawn for a single tree.
# gamma: the minimum loss reduction required to make a further split on a leaf node. Must be >= 0.
# Reasonable values depend on the loss function and should be tuned.
# max_depth: the maximum depth of each tree.
# Small trees are preferred because complex trees with many leaves tend to overfit the training data;
# hundreds of small trees usually beat dozens of large ones. Typical range 3 ~ 10, often best at 3 ~ 6.
# max_leaves: similar to max_depth, but caps the number of leaves per tree instead of the depth.
# Usually unnecessary when max_depth is set.
# subsample: the fraction of training rows randomly sampled to build each tree.
# colsample_bytree: similar to subsample, but randomly samples a fraction of the features (columns) for each tree.
# colsample_bylevel: the fraction of features sampled at each depth level of a tree.
# No need to use this if subsample and colsample_bytree are already set.
# lambda: the weight of the L2 regularization term on leaf weights.
# alpha: the weight of the L1 regularization term on leaf weights.
# objective: "reg:squarederror" --squared-error regression (formerly "reg:linear")
# "reg:logistic" --logistic regression
# "binary:logistic" --logistic regression for binary classification, output probability
'''Note: max_depth and min_child_weight usually have the highest impact on the model outcome; tune those two first, then the rest.'''
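# For reference (a summary of the split-gain formula from the XGBoost docs, added as a reading
# aid and not part of the original notes): several of the parameters above interact through
#     Gain = 1/2 * [ G_L^2/(H_L + lambda) + G_R^2/(H_R + lambda)
#                    - (G_L + G_R)^2/(H_L + H_R + lambda) ] - gamma
# where G_* / H_* are the sums of gradients / hessians in the left and right child.
# A candidate split is kept only if its Gain is positive, so larger gamma or lambda demands a
# larger loss reduction, and min_child_weight requires H_L and H_R to each reach that threshold.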
parameters = {
    'booster': 'gbtree',             # gbtree, gblinear, dart
    'objective': 'binary:logistic',  # log loss as the objective function
    'eval_metric': eval_metric,      # auc
    'base_score': 0.5,               # initial prediction score (global bias)
    'eta': 0.1,                      # start at 0.1, a relatively high value; try 0.01 later
    'gamma': 0,                      # minimum loss reduction required for a leaf split
    'max_depth': 5,                  # start at 5, may try 3 ~ 10
    'min_child_weight': 1,           # default 1
    'max_delta_step': 0,
    'subsample': 0.8,                # start with an 80% row sampling rate
    'colsample_bytree': 0.8,         # start with an 80% column sampling rate
    'colsample_bylevel': 1,
    'lambda': 1,
    'alpha': 0.1,
    'scale_pos_weight': 1,           # ratio #negative/#positive, default 1
    'nthread': 4,                    # 4-core CPU
    'seed': 88,
    'verbosity': 0                   # silence log output (replaces the deprecated 'silent')
}
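# Optional sanity check, not part of the original tuning flow: a single cross-validated run with
# the starting parameters gives a baseline score to compare the grid-search results against.
# It uses the same xgboost.cv call as the tuning loops below; baseline_cv is just a local name.
baseline_cv = xgboost.cv(
    parameters,
    xgtrain,
    num_boost_round = 100,
    nfold = 5,
    early_stopping_rounds = 20
)
print('baseline {0}: {1}'.format(eval_metric, baseline_cv[cv_metric_name].iloc[-1]))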
##### rounds of parameter tuning #####
'''Grid search with the built-in xgboost.cv, so each parameter combination is cross-validated.'''
best_score = 0
for eta in [0.05, 0.04, 0.03, 0.02]:
    for max_depth in [4, 5, 6]:
        for min_child_weight in [3, 2, 1.5, 1]:
            parameters['eta'] = eta
            parameters['max_depth'] = max_depth
            parameters['min_child_weight'] = min_child_weight
            cv_result = xgboost.cv(        # run cross-validation with the current parameters
                parameters,
                xgtrain,
                num_boost_round = 100,     # upper bound; early stopping usually ends sooner
                nfold = 5,                 # 5-fold cross-validation
                early_stopping_rounds = 20 # stop early if no improvement for 20 rounds
            )
            # keep the best score as the grid search proceeds
            score = cv_result[cv_metric_name].iloc[-1]
            if score > best_score:
                best_score = score
                best_eta = eta
                best_max_depth = max_depth
                best_min_child_weight = min_child_weight
                best_boosting_rounds = cv_result.shape[0]  # best number of boosting rounds (trees)
            print('====eta:{0}, max depth:{1}, min child weight:{2} --- best rounds: {3} score: {4}'.format(
                eta, max_depth, min_child_weight, cv_result.shape[0], score))
print('the best {0} is: {1}'.format(eval_metric, best_score))
print(' boosting rounds {0}'.format(best_boosting_rounds))
print(' learning rate {0}'.format(best_eta))
print(' max depth {0}'.format(best_max_depth))
print(' min child weight {0}'.format(best_min_child_weight))
parameters['eta'] = best_eta
parameters['max_depth'] = best_max_depth
parameters['min_child_weight'] = best_min_child_weight
'''grid search for gamma'''
best_score = 0
for gamma in [i/10.0 for i in range(0,6)]:
    parameters['gamma'] = gamma
    cv_result = xgboost.cv(
        parameters,
        xgtrain,
        num_boost_round = 100,
        nfold = 5,
        early_stopping_rounds = 20
    )
    score = cv_result[cv_metric_name].iloc[-1]
    if score > best_score:
        best_score = score
        best_gamma = gamma
        best_boosting_rounds = cv_result.shape[0]
    print('====gamma:{0} --- best rounds: {1} score: {2}'.format(gamma, cv_result.shape[0], score))
print('the best {0} is: {1}'.format(eval_metric, best_score))
print(' boosting rounds {0}'.format(best_boosting_rounds))
print(' gamma {0}'.format(best_gamma))
parameters['gamma'] = best_gamma
'''grid search for subsample and colsample_bytree'''
best_score = 0
for subsample in [0.6,0.7,0.8,0.9]:
    for colsample_bytree in [0.6, 0.7, 0.8, 0.9]:
        # for colsample_bylevel in [0.8, 0.9, 1.0]:
        parameters['subsample'] = subsample
        parameters['colsample_bytree'] = colsample_bytree
        cv_result = xgboost.cv(
            parameters,
            xgtrain,
            num_boost_round = 100,
            nfold = 5,
            early_stopping_rounds = 20
        )
        score = cv_result[cv_metric_name].iloc[-1]
        if score > best_score:
            best_score = score
            best_subsample = subsample
            best_colsample_bytree = colsample_bytree
            best_boosting_rounds = cv_result.shape[0]
        print('====subsample:{0}, colsample_bytree:{1} --- best rounds: {2} score: {3}'.format(
            subsample, colsample_bytree, cv_result.shape[0], score))
print('the best {0} is: {1}'.format(eval_metric, best_score))
print(' boosting rounds {0}'.format(best_boosting_rounds))
print(' subsample {0}'.format(best_subsample))
print(' colsample_bytree {0}'.format(best_colsample_bytree))
parameters['subsample'] = best_subsample
parameters['colsample_bytree'] = best_colsample_bytree
'''grid search for L1 and L2 regularization'''
best_score = 0
for reg_alpha in [0.1, 0.2, 0.5, 1]:
    for reg_lambda in [0.5, 1, 2, 3]:
        parameters['alpha'] = reg_alpha
        parameters['lambda'] = reg_lambda
        cv_result = xgboost.cv(
            parameters,
            xgtrain,
            num_boost_round = 100,
            nfold = 5,
            early_stopping_rounds = 20
        )
        score = cv_result[cv_metric_name].iloc[-1]
        if score > best_score:
            best_score = score
            best_alpha = reg_alpha
            best_lambda = reg_lambda
            best_boosting_rounds = cv_result.shape[0]
        print('====alpha:{0}, lambda:{1} --- best rounds: {2} score: {3}'.format(
            reg_alpha, reg_lambda, cv_result.shape[0], score))
print('the best {0} is: {1}'.format(eval_metric, best_score))
print(' boosting rounds {0}'.format(best_boosting_rounds))
print(' alpha {0}'.format(best_alpha))
print(' lambda {0}'.format(best_lambda))
parameters['alpha'] = best_alpha
parameters['lambda'] = best_lambda
'''refit the model with the best parameters'''
best_booster = xgboost.train(
    params = parameters,
    dtrain = xgtrain,
    num_boost_round = best_boosting_rounds
)
y_pred = best_booster.predict(xgtest)  # predicted probabilities for the positive class
print(y_pred)
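# A possible follow-up, not in the original script: since feature_names were set on the DMatrix,
# the tuned booster can report per-feature importance, and the model can be saved for reuse.
# Both calls below are standard Booster methods; the output path is an arbitrary example.
print(best_booster.get_score(importance_type = 'gain'))  # importance by average gain per split
best_booster.save_model('xgb_tuned.json')                # 'xgb_tuned.json' is a placeholder path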