import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Simple fit-predict
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, random_state=123) # Classification
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10, random_state=123) # Regression ('reg:linear' is deprecated)
xg_cl.fit(X_train, y_train)
preds = xg_cl.predict(X_test)
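# Example (sketch): evaluate the simple fit-predict classifier, assuming a binary target;
# accuracy_simple and pred_probs are illustrative names, not part of the original notes
accuracy_simple = accuracy_score(y_test, preds)
pred_probs = xg_cl.predict_proba(X_test)[:, 1] # Predicted probability of the positive class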
# Cross validation (Method 1: the native xgboost API, which works on DMatrix with cv/train/predict rather than sklearn's fit/predict)
dmatrix = xgb.DMatrix(data=X_train, label=y_train)
params_clf={"objective":"binary:logistic","max_depth":4} # Classification parameters
params_reg={"objective":"reg:squarederror","booster":"gblinear"} # Regression parameters with a specified base learner
# Regularization parameters: "alpha" for L1, "lambda" for L2, "gamma" for the minimum loss reduction required to split a node (penalizes tree complexity)
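# Example (sketch): classification parameters with illustrative regularization values added; the values are placeholders, not tuned
params_clf_regularized = {"objective": "binary:logistic", "max_depth": 4,
                          "alpha": 10, "lambda": 1, "gamma": 0.1}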
cv_results = xgb.cv(dtrain=dmatrix, params=params_clf, nfold=4, num_boost_round=10,
                    metrics="error", as_pandas=True, stratified=True, early_stopping_rounds=10, verbose_eval=1)
# accuracy_cv = 1 - cv_results['test-error-mean'].iloc[-1]
# Train the final model with the best number of boosting rounds
best_num_boost_round = len(cv_results) # With early stopping, cv_results only keeps rows up to the best round
final_model = xgb.train(params=params_clf, dtrain=dmatrix, num_boost_round=best_num_boost_round)
# Make predictions on the testing dataset
dtest = xgb.DMatrix(X_test) # The label (y_test) is optional when only predicting
y_pred_prob = final_model.predict(dtest)
y_pred_binary = np.round(y_pred_prob) # Convert probabilities to binary predictions
accuracy_final = accuracy_score(y_test, y_pred_binary)
# Cross validation (Method 2 : Using scikit-learn)
from sklearn.model_selection import cross_val_score, StratifiedKFold
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=cv, scoring='accuracy')
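# Example (sketch): cv_scores holds one accuracy per fold, so summarize with mean and standard deviation
print(f"CV accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")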
from sklearn.model_selection import cross_val_predict
y_pred_cv = cross_val_predict(xgb_model, X_train, y_train, cv=cv) # Out-of-fold predictions on the data being cross-validated
accuracy_cv_sklearn = accuracy_score(y_train, y_pred_cv)
# GridSearch / RandomizedSearch (HYPERPARAMETER TUNING)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
param_grid = {'learning_rate': np.arange(0.05,1.05,.05), 'n_estimators': [200], 'subsample': np.arange(0.05,1.05,.05)}
gbm = xgb.XGBRegressor()
tuning_models = RandomizedSearchCV(estimator=gbm, param_distributions=param_grid, n_iter=25,
                                   scoring='neg_mean_squared_error', cv=4, verbose=1) # For GridSearchCV, pass param_grid= and drop n_iter
tuning_models.fit(X, y)
tuning_models.best_params_ # See the parameters that give the best results
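# Example (sketch): other useful attributes once the search is fitted; best_rmse is an illustrative name
tuning_models.best_score_ # Best cross-validated score (negative MSE here, so closer to 0 is better)
best_rmse = np.sqrt(np.abs(tuning_models.best_score_)) # Convert to RMSE for easier reading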
# Visualize tree
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin' ### MAKE SURE TO INSTALL GRAPHVIZ AND ADD THE INSTALLATION PATH
xgb.plot_tree(final_model, num_trees=0) # Plot the first tree; add rankdir="LR" to lay the tree out left to right
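# Example (sketch): plot_tree draws onto a matplotlib axes, so render or save the figure explicitly
import matplotlib.pyplot as plt
plt.show() # Or plt.savefig("tree.png", dpi=300) to save it to disk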