# Extend the describe() summary of a DataFrame with 3-sigma outlier metrics
# (µ-3σ, µ+3σ, outlier count, outlier %) and clip the outliers in place.

# Local import keeps this script section self-contained; pd.concat is needed
# below because DataFrame.append no longer exists in pandas >= 2.0.
import pandas as pd

# Generate the base summary frame using the pandas describe() function.
summary_df = df.describe() # www.athulmathew.com
# NOTE(review): the summary is computed from `df`, while outliers are counted
# and clipped on `Df` -- confirm both names refer to the same DataFrame.

# Dict initialisation: one dict per extra summary row, keyed by column name.
# The "index" key holds the row label shown in the final table.
myu_sigma_dict = {"index": "µ-3σ"}
myupsigma_dict = {"index": "µ+3σ"}
outlier_dict = {"index": "outlier count"}
outlier_pct_dict = {"index": "outlier(%)"}

for col in summary_df.columns:
    myu = summary_df.loc["mean", col]
    sigma = summary_df.loc["std", col]
    # Compute the 3-sigma bounds once and reuse them everywhere below.
    lower = myu - 3 * sigma
    upper = myu + 3 * sigma
    myu_sigma_dict[col] = lower
    myupsigma_dict[col] = upper

    # Masks are taken before any replacement; values clipped to `lower` can
    # never exceed `upper`, so reusing them matches a re-evaluation.
    below = Df[col] < lower
    above = Df[col] > upper
    outlier_dict[col] = int((below | above).sum())
    outlier_pct_dict[col] = outlier_dict[col] * 100 / summary_df.loc["count", col]

    # Replace outliers with (myu - 3*sigma) and (myu + 3*sigma).
    Df.loc[below, col] = lower
    Df.loc[above, col] = upper

# Add the newly created metrics to the summary frame as four extra rows.
summary_df = summary_df.reset_index()
extra_rows = pd.DataFrame(
    [myu_sigma_dict, myupsigma_dict, outlier_dict, outlier_pct_dict]
)
summary_df = pd.concat([summary_df, extra_rows], ignore_index=True)
summary_df