xxxxxxxxxx
# Demo frame mixing a categorical, a numeric and an object (string) column.
df = pd.DataFrame(
    data={
        "categorical": pd.Categorical(["d", "e", "f"]),
        "numeric": [1, 2, 3],
        "object": ["a", "b", "c"],
    }
)
>>> df.describe()
       numeric
count      3.0
mean       2.0
std        1.0
min        1.0
25%        1.5
50%        2.0
75%        2.5
max        3.0
pandas describe without outliers
xxxxxxxxxx
def drop_series_outliers(serie):
    """
    Drop outliers from a pandas series using the 1.5 * IQR rule.

    A value is kept when it lies inside the closed interval
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. Missing values never satisfy
    the comparison, so they are dropped as well.

    Args:
        serie (pd.Series): Numeric series to drop outliers from.

    Returns:
        pd.Series: The values of ``serie`` that are not outliers, with
            their original index labels.
    """
    q_1 = serie.quantile(0.25)
    q_3 = serie.quantile(0.75)
    iqr = q_3 - q_1  # Interquartile range
    low = q_1 - 1.5 * iqr
    high = q_3 + 1.5 * iqr
    # The bounds are inclusive: a value sitting exactly on a fence is not an
    # outlier. With strict comparisons a series whose IQR is 0 (e.g. a
    # constant column) would have low == high and lose every single value.
    return serie.loc[(serie >= low) & (serie <= high)]
def describe(data, *args, drop_outliers=True, **kwargs):
    """
    Describe a pd.DataFrame with some useful information.

    Extends ``pd.DataFrame.describe`` (transposed to one row per described
    column) with null counts, unique counts, dtype and memory usage, and
    optionally recomputes the statistics with outliers removed.

    Args:
        data (pd.DataFrame): DataFrame to describe.
        *args: Arguments to pass to pd.DataFrame.describe.
        drop_outliers (bool): If True, ``mean``, ``std``, ``min``, ``max``
            and the percentile columns are computed on each numeric column
            after ``drop_series_outliers`` was applied to it. The original
            extremes are kept as ``ovr_min`` / ``ovr_max`` and the number
            and share of removed values as ``outliers`` / ``outliers%``.
            Rows of non-numeric columns get NaN in these columns.
        **kwargs: Keyword arguments to pass to pd.DataFrame.describe.

    Returns:
        pd.DataFrame: DataFrame with the description, one row per column
            that ``data.describe`` reported.
    """
    desc = data.describe(*args, **kwargs).T

    # The percentile labels depend on the ``percentiles`` argument, so read
    # them from the result instead of assuming "25%", "50%" and "75%".
    pct_cols = [
        col for col in desc.columns if isinstance(col, str) and col.endswith("%")
    ]
    stats = ["mean", "std", "min", *pct_cols, "max"]
    # Guarantee every statistic column exists (NaN when describe() did not
    # produce it, e.g. for a frame that has no numeric column at all).
    desc = desc.reindex(columns=["count", *stats])

    cols = ["count", "nulls", "nulls%"]
    min_cols = ["min"]
    max_cols = ["max"]

    # if data should be cleaned from outliers
    if drop_outliers:
        is_numeric = pd.api.types.is_numeric_dtype
        is_bool = pd.api.types.is_bool_dtype
        # Quantile-based outlier removal is only defined for numeric data;
        # categorical / object / bool columns are skipped.
        numeric = [
            name
            for name in desc.index
            if is_numeric(data[name]) and not is_bool(data[name])
        ]
        trimmed = {
            name: data[name].pipe(drop_series_outliers).describe(*args, **kwargs)
            for name in numeric
        }
        # Align on the statistic labels rather than on position, so that
        # custom percentiles and skipped columns cannot misalign the values.
        wo_outliers = pd.DataFrame(trimmed).T.reindex(
            index=desc.index, columns=["count", *stats]
        )

        desc["outliers"] = desc["count"] - wo_outliers["count"]
        desc["outliers%"] = desc["outliers"] / desc["count"]
        desc["ovr_min"] = desc["min"]
        desc["ovr_max"] = desc["max"]
        for stat in stats:
            desc[stat] = wo_outliers[stat]

        cols += ["outliers", "outliers%"]
        min_cols = ["ovr_min", "min"]
        max_cols = ["ovr_max", "max"]

    cols += [
        "unique",
        "unique%",
        "mean",
        "std",
        *min_cols,
        *pct_cols,
        *max_cols,
        "dtype",
        "memory",
        "memory%",
    ]

    # Column assignment aligns on the index, so only the described columns
    # are kept (the "Index" entry of memory_usage is dropped this way).
    desc["nulls"] = data.isnull().sum()
    desc["nulls%"] = desc["nulls"] / len(data)
    desc["unique"] = data.nunique()
    desc["unique%"] = desc["unique"] / len(data)
    desc["dtype"] = data.dtypes
    desc["memory"] = data.memory_usage(deep=True)
    desc["memory%"] = desc["memory"] / desc["memory"].sum()
    return desc[cols]
# Run the enhanced describe on the demo frame (same as ``df.pipe(describe)``).
describe(df)
An enhanced describe function without outliers for pandas