```python
# Import the necessary modules
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Create training and test sets (40% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
# Create the classifier: logreg
logreg = LogisticRegression()
# Fit the classifier to the training data
logreg.fit(X_train, y_train)
# Predict the labels of the test set: y_pred
y_pred = logreg.predict(X_test)
# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
```
```python
# import the class
from sklearn.linear_model import LogisticRegression
# instantiate the model (using the default parameters)
logreg = LogisticRegression()
# fit the model with data
logreg.fit(X_train, y_train)
# predict the labels of the test set
y_pred = logreg.predict(X_test)
```
```python
# import the metrics class
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
```
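
A quick way to visualize this matrix is an annotated heatmap; a minimal sketch, assuming `cnf_matrix` from the cell above (the seaborn heatmap styling here is just one option):

```python
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the confusion matrix as an annotated heatmap
sns.heatmap(cnf_matrix, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted label")
plt.ylabel("Actual label")
plt.show()
```
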
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import logit
model = logit("target ~ x_var", data=df).fit()
print(model.params)
# Visualize the logistic model fit
sns.regplot(x="x_var", y="target", data=df, ci=None, logistic=True)
plt.show()
# Predict probabilities on a grid of explanatory values
X_test = pd.DataFrame({"x_var": np.arange(-1, 6.25, 0.25)})
y_pred_prob = model.predict(X_test)
y_pred = np.round(y_pred_prob)
# Odds ratio: p / (1 - p), the probability of the event happening over it not happening
X_test["odds_ratio"] = y_pred_prob / (1 - y_pred_prob)
# Visualize the odds ratio / log odds ratio
sns.lineplot(x="x_var", y="odds_ratio", data=X_test)
plt.axhline(y=1, linestyle="dotted")
plt.yscale("log")  # to make the curve linear, plot np.log of the odds on the y-axis instead
plt.show()
# Confusion matrix (rows = actual class, columns = predicted class)
conf_matrix = model.pred_table()
TN = conf_matrix[0, 0]
TP = conf_matrix[1, 1]
FN = conf_matrix[1, 0]
FP = conf_matrix[0, 1]
# Visualize the confusion matrix as a mosaic plot
from statsmodels.graphics.mosaicplot import mosaic
mosaic(conf_matrix)
plt.show()
```
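
From these four counts the usual rates follow directly; a minimal sketch, assuming the `TN`, `TP`, `FN`, `FP` values extracted above:

```python
# Derive accuracy, sensitivity and specificity from the confusion-matrix counts
accuracy = (TN + TP) / (TN + TP + FN + FP)   # fraction of all cases classified correctly
sensitivity = TP / (TP + FN)                 # true positive rate (recall)
specificity = TN / (TN + FP)                 # true negative rate
print(accuracy, sensitivity, specificity)
```
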
- It is a classification algorithm, despite the "regression" in its name
- It is called regression because it outputs a continuous value: the sigmoid of a linear combination of the features, interpreted as a probability (likelihood of the positive class)
- It compares that probability with a threshold (typically 0.5) to reach a conclusion, as shown in the sketch after this list
- It works well for data with 2 classes that are linearly separable by a line (the decision boundary)
- For multiple classes, use softmax (multinomial logistic regression)
- Logistic regression calculates the probability of the positive (target) class only; subtracting it from 1 gives the probability of the negative class
- Training process (see the gradient-descent sketch after this list):
  1. Initialize theta (the coefficients) with random values
  2. Calculate the sigmoid (probability) of the output for a case
  3. Compare this probability with the actual output and record the difference as the error
  4. Calculate this error for all training cases; the total error is the cost of the model (the cost function; for logistic regression this is typically log loss rather than MSE)
  5. Change theta in a way that reduces the total cost (using an optimization algorithm such as gradient descent)
  6. Iterate from step 2 until the cost is low enough and the model is satisfactory
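
To make the probability-and-threshold idea concrete, here is a minimal scikit-learn sketch; the `make_classification` toy data and the 0.5 threshold are assumptions for illustration, not anything from the notes above:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hypothetical two-class data
X, y = make_classification(n_samples=200, n_features=2, n_informative=2,
                           n_redundant=0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

logreg = LogisticRegression().fit(X_train, y_train)

# Probability of the positive class only; the negative class is 1 - p
p_positive = logreg.predict_proba(X_test)[:, 1]
p_negative = 1 - p_positive

# Comparing the probability with a 0.5 threshold gives the class labels
manual_labels = (p_positive > 0.5).astype(int)
print(np.array_equal(manual_labels, logreg.predict(X_test)))  # matches predict() here
```
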
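And a NumPy sketch of the training loop described in the numbered steps; this is an illustrative toy implementation (log-loss gradient descent on made-up data), not the exact procedure the libraries above use:

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Made-up training data: an intercept column plus one feature
rng = np.random.default_rng(42)
X = np.column_stack([np.ones(100), rng.normal(size=100)])
y = (X[:, 1] + rng.normal(scale=0.5, size=100) > 0).astype(float)

theta = rng.normal(size=2)              # step 1: random initial coefficients
learning_rate = 0.1

for _ in range(1000):                   # step 6: iterate until the cost is low
    p = sigmoid(X @ theta)              # step 2: predicted probability per case
    error = p - y                       # step 3: difference from the actual label
    cost = -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))  # step 4: log-loss cost
    gradient = X.T @ error / len(y)
    theta -= learning_rate * gradient   # step 5: move theta to reduce the cost

print(theta, cost)
```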