# Decision-tree workflow: split, weight, train, predict, inspect, evaluate, plot.
# NOTE(review): `X`, `y`, `randomforest_model` and `plt` are not defined in this
# section; they are assumed to be provided earlier in the file - confirm.

# Split into train and test set (30% held out, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=0.3, random_state=3)

# Take the class imbalance into account: 'balanced' weights each sample
# inversely to the frequency of its class in the training labels
from sklearn.utils.class_weight import compute_sample_weight
w_train = compute_sample_weight('balanced', y_Train)

# Train the classifier
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(criterion="entropy", max_depth=4)
tree_clf.fit(X_Train, y_Train, sample_weight=w_train)

# Alternative approach: train the classifier with snapml (offers multi-threaded CPU/GPU training).
# The import is aliased so it does not shadow sklearn's DecisionTreeClassifier.
from snapml import DecisionTreeClassifier as SnapDecisionTreeClassifier
snapml_dt_gpu = SnapDecisionTreeClassifier(max_depth=4, random_state=45, use_gpu=True)
snapml_dt_cpu = SnapDecisionTreeClassifier(max_depth=4, random_state=45, n_jobs=4)
snapml_dt = snapml_dt_cpu  # switch to snapml_dt_gpu when a GPU is available
snapml_dt.fit(X_Train, y_Train, sample_weight=w_train)

# Predict hard class labels on the held-out set
y_pred = tree_clf.predict(X_Test)

### Inspecting a random forest
# Pull out one tree from the forest (only applies when the model is a random forest)
chosen_tree = randomforest_model.estimators_[7]  # You can visualize it with (graphviz & pydotplus)

# Extract the root-node decision. Node 0 is the root, so the feature index and
# the threshold must both be read at index 0 to describe the same split.
split_column = chosen_tree.tree_.feature[0]  # Index of the first column it split on
split_column_name = X_Train.columns[split_column]  # Name of that column (X_Train must be a DataFrame)
split_value = chosen_tree.tree_.threshold[0]  # Threshold value of that same split

# Compute predicted probabilities of the positive class (second column)
y_pred_prob = tree_clf.predict_proba(X_Test)[:, 1]

# Evaluate tree: accuracy uses hard labels, ROC AUC uses the probability scores
from sklearn.metrics import roc_auc_score, accuracy_score
accuracy = accuracy_score(y_Test, y_pred)
roc_auc = roc_auc_score(y_Test, y_pred_prob)
print(f"accuracy={accuracy:.4f}, roc_auc={roc_auc:.4f}")

# Visualize the chosen tree using plot_tree
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
plot_tree(chosen_tree, feature_names=list(X_Train.columns), filled=True, rounded=True, fontsize=10)
plt.show()