# --- Linear regression with statsmodels ---
# You want a regression line: y = mx + c
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
# Assumes a DataFrame df with columns "y", "x_num" (numeric), and "x_cat" (categorical)
# Formula "y ~ x_num" fits y = m*x + c; the slope m and intercept c are found automatically for the best fit
num_model = ols("y ~ x_num", data=df).fit()
# "+ 0" removes the intercept, so each category's coefficient is that category's mean (relative to 0 rather than to a baseline category)
cat_model = ols("y ~ x_cat + 0", data=df).fit()
# See model params (for the no-intercept categorical model these are the per-category means)
print(cat_model.params)
# For num_model, the params are the intercept c and slope m
print(num_model.params)
# See prediction on original fitted data
print(num_model.fittedvalues)
# See residuals
print(num_model.resid)
# Model summary
num_model.summary()
# R-squared
print(num_model.rsquared)
# Residual standard error (RSE): the square root of the mean squared residual
rse = np.sqrt(num_model.mse_resid)
# Create test data
test_data = pd.DataFrame({"x_num": np.arange(20, 41)})
# Predict on test data
print(num_model.predict(test_data))
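# A quick visual check (a sketch, assuming the df and test_data defined above):
# scatter the original data and overlay the model's predictions on the test range
plt.scatter(df["x_num"], df["y"])
plt.plot(test_data["x_num"], num_model.predict(test_data), color="red")
plt.show()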
# Influence measures for each observation
summary_df = num_model.get_influence().summary_frame()
# Leverage
df["leverage"] = summary_df["hat_diag"]
# Cook's distance
df["cooks_dist"] = summary_df["cooks_d"]
# Residual plot (the lowess line should stay near zero if the fit is good)
sns.residplot(x="x_num", y="y", data=df, lowess=True)
plt.show()
# QQ plot (points should follow the 45-degree line if residuals are normal)
from statsmodels.api import qqplot
qqplot(data=num_model.resid, fit=True, line="45")
plt.show()
# Scale-location plot: sqrt of |standardized residuals| vs fitted values
standardized_resid = num_model.get_influence().resid_studentized_internal
sqrt_abs_resid = np.sqrt(np.abs(standardized_resid))
sns.regplot(x=num_model.fittedvalues, y=sqrt_abs_resid, ci=None, lowess=True)
plt.show()
# NOTE: You can transform X and/or y before fitting. Predictions from such a model are on the transformed scale, so back-transform them if y was transformed.
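# A minimal sketch of the transform idea, assuming y > 0 so a log transform is valid:
log_model = ols("np.log(y) ~ x_num", data=df).fit()
pred_y = np.exp(log_model.predict(test_data))  # back-transform predictions to the original y scale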
# --- Linear regression with scikit-learn ---
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Assumes a feature matrix X, target vector y, and a DataFrame df with a "target" column
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Construct model
lm = LinearRegression()
# Simple linear regression uses 1 feature column: y = m*x + c
# Multiple linear regression uses several feature columns, e.g. z = m*x + n*y + c
# Fit the model
lm.fit(X_train, y_train)
# Predicted estimation
y_pred = lm.predict(X_test)
# Intercept c of the line (also known as the bias coefficient)
intercept = lm.intercept_
# Slope m of the line y = mx + c (one coefficient per feature)
slope = lm.coef_
# R-squared: proportion of the variance in the target explained by the features
rsquared = lm.score(X_test, y_test)
# RMSE: root mean squared error, the typical size of a prediction error
rmse = mean_squared_error(y_test, y_pred, squared=False)
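# Newer scikit-learn (an assumption: version >= 1.4) exposes this directly:
# from sklearn.metrics import root_mean_squared_error
# rmse = root_mean_squared_error(y_test, y_pred)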
# Predict over a specific range
new_x = np.arange(1, 101, 1).reshape(-1, 1)  # or build a DataFrame instead
new_pred_y = lm.predict(new_x)
# Do k-fold cross validation
kf = KFold(n_splits=6, shuffle=True, random_state=42)
cv_results = cross_val_score(lm, X_train, y_train, cv=kf)
# Mean, standard deviation, and a 95% interval of the cross-validation scores
print(np.mean(cv_results), np.std(cv_results), np.quantile(cv_results, [0.025, 0.975]))
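# cross_val_score uses the estimator's default score (R-squared here); a sketch of
# cross-validating RMSE instead (the scorer is negated by sklearn convention):
rmse_scores = -cross_val_score(lm, X_train, y_train, cv=kf, scoring="neg_root_mean_squared_error")
print(rmse_scores)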
# Visualize feature importance via coefficients (comparable only if features are on the same scale)
names = df.drop("target", axis=1).columns
importance = lm.fit(X, y).coef_
plt.bar(names, importance)
plt.xticks(rotation=45)
plt.show()
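# If features are on different scales, standardize before comparing coefficients.
# A minimal sketch using a pipeline (StandardScaler is an assumption, one common choice):
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
scaled_lm = make_pipeline(StandardScaler(), LinearRegression())
scaled_lm.fit(X, y)
print(scaled_lm.named_steps["linearregression"].coef_)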
# --- Quick regression plot with seaborn ---
import seaborn as sb
from matplotlib import pyplot as plt
df = sb.load_dataset('tips')
sb.regplot(x = "total_bill", y = "tip", data = df)
plt.show()
# --- Linear regression with scipy ---
from scipy import stats
import numpy as np
x = np.random.random(10)
y = np.random.random(10)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
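# The returned values can be used directly; r_value**2 gives R-squared:
print("y = {:.3f}*x + {:.3f}, R^2 = {:.3f}".format(slope, intercept, r_value**2))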
# --- Simple linear regression from scratch ---
import numpy as np
import matplotlib.pyplot as plt
def estimate_coef(x, y):
    # number of observations/points
    n = np.size(x)
    # mean of the x and y vectors
    m_x = np.mean(x)
    m_y = np.mean(y)
    # calculating cross-deviation and deviation about x
    SS_xy = np.sum(y * x) - n * m_y * m_x
    SS_xx = np.sum(x * x) - n * m_x * m_x
    # calculating regression coefficients
    b_1 = SS_xy / SS_xx
    b_0 = m_y - b_1 * m_x
    return (b_0, b_1)

def plot_regression_line(x, y, b):
    # plotting the actual points as a scatter plot
    plt.scatter(x, y, color="m", marker="o", s=30)
    # predicted response vector
    y_pred = b[0] + b[1] * x
    # plotting the regression line
    plt.plot(x, y_pred, color="g")
    # putting labels
    plt.xlabel('x')
    plt.ylabel('y')
    # show the plot
    plt.show()

def main():
    # observations / data
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
    # estimating coefficients
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {}\nb_1 = {}".format(b[0], b[1]))
    # plotting the regression line
    plot_regression_line(x, y, b)

if __name__ == "__main__":
    main()
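# Sanity check (a sketch): NumPy's degree-1 polynomial fit should match b_1 and b_0 above
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
slope_np, intercept_np = np.polyfit(x, y, 1)  # returns highest-degree coefficient first
print(intercept_np, slope_np)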