#### FOR Classification ###########
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# Split the dataset into 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
# Instantiate the Classification Tree
cl_dt = DecisionTreeClassifier(max_depth=2, random_state=42, criterion='gini')
# Train the model
cl_dt.fit(X_train,y_train)
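# Optional sketch: print the fitted tree's split rules to see what max_depth=2 learned
# (export_text is a standard sklearn helper; feature names are omitted here, so columns show as feature_0, feature_1, ...)
from sklearn.tree import export_text
print(export_text(cl_dt))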
# Predict using test set
y_pred = cl_dt.predict(X_test)
# Evaluate the test set accuracy
acc_cl_dt = accuracy_score(y_test, y_pred)
print(acc_cl_dt)
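# Optional sketch: a confusion matrix gives a per-class view beyond plain accuracy
# (confusion_matrix is a standard sklearn metric; reuses y_test and y_pred from above)
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))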
# To check for overfitting, compare the cross-validated log loss on the
# training set (computed below) with the log loss on the test set
# cross_val_score returns negative log loss, so flip the sign
log_loss_cv = -cross_val_score(cl_dt, X_train, y_train, cv=10, scoring='neg_log_loss', n_jobs=-1)
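# A minimal sketch of the comparison itself (assumes cl_dt, X_test, y_test from above):
# compute the test set log loss and set it against the CV log loss
from sklearn.metrics import log_loss
test_log_loss = log_loss(y_test, cl_dt.predict_proba(X_test))
print(log_loss_cv.mean(), test_log_loss)
# A test log loss well above the CV log loss points to overfitting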
#### FOR Regression ###########
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import cross_val_score
# Split the dataset into 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Instantiate the Regression Tree
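# min_samples_leaf=0.1 below is a fraction: each leaf must hold at least 10% of the training samples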
reg_dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.1, random_state=3)
# Train with training data
reg_dt.fit(X_train, y_train)
# Predict
y_pred = reg_dt.predict(X_test)
# Compute RMSE for testing data
mse_reg_dt = MSE(y_test, y_pred)
rmse_reg_dt = mse_reg_dt**(1/2)
print(rmse_reg_dt)
# To check for overfitting, compare the test set RMSE above with the
# 10-fold cross-validated RMSE on the training set
# cross_val_score returns negative MSE, so flip the sign
mse_cv = -cross_val_score(reg_dt, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
rmse_cv = mse_cv**(1/2)
print(rmse_cv.mean())
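# A minimal sketch of the full bias/variance check (uses reg_dt, X_train, y_train from above):
# the training RMSE completes the picture next to the CV RMSE and test RMSE
y_pred_train = reg_dt.predict(X_train)
rmse_train = MSE(y_train, y_pred_train)**(1/2)
print(rmse_train)
# CV RMSE well above training RMSE suggests overfitting (high variance);
# both high relative to the desired error suggests underfitting (high bias)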