xxxxxxxxxx
# Columns to screen for outliers (one or more).
cols = ['col_1', 'col_2']

# Per-column quartiles and interquartile range.
Q1 = df[cols].quantile(0.25)
Q3 = df[cols].quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: a cell strictly outside [lower_fence, upper_fence] is an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outlier_cells = (df[cols] < lower_fence) | (df[cols] > upper_fence)

# Drop every row holding an outlier in at least one screened column.
df = df[~outlier_cells.any(axis=1)]
xxxxxxxxxx
#------------------------------------------------------------------------------
# accept a dataframe, remove outliers, return cleaned data in a new dataframe
# see http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm
#------------------------------------------------------------------------------
def remove_outlier(df_in, col_name, factor=1.5):
    """Return the rows of ``df_in`` whose ``col_name`` value is not an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR``
    or strictly above ``Q3 + factor * IQR`` (Tukey's fences).  Values sitting
    exactly on a fence are kept, so a column with zero spread (``IQR == 0``)
    is returned intact instead of being emptied.

    Args:
        df_in: DataFrame to filter; it is not modified.
        col_name: Label of the numeric column to screen.
        factor: Multiplier applied to the IQR to place the fences
            (1.5 by default).

    Returns:
        A DataFrame containing the rows of ``df_in`` that fall inside the
        fences.  Rows where the column is NaN are dropped, because NaN never
        compares as being inside the range.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - factor * iqr
    fence_high = q3 + factor * iqr
    # ``between`` includes both endpoints, so on-the-fence values survive.
    return df_in.loc[values.between(fence_low, fence_high)]
xxxxxxxxxx
# Solution is based on this article:
# http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm
import pandas as pd
import numpy as np
def _iqr_fences(q1, q3, factor=1.5):
    """Return the ``(low, high)`` Tukey fences for the given quartiles."""
    spread = q3 - q1  # interquartile range
    return q1 - factor * spread, q3 + factor * spread


def remove_outliers_from_series(series, factor=1.5):
    """Return the elements of ``series`` that lie within the IQR fences.

    Values exactly on a fence are kept, so a zero-spread series is returned
    intact.  NaN entries are dropped because they never compare as inside
    the range.

    Args:
        series: pandas Series of numeric values; it is not modified.
        factor: Multiplier applied to the IQR to place the fences.

    Returns:
        A Series holding the non-outlier elements, original index preserved.
    """
    fence_low, fence_high = _iqr_fences(
        series.quantile(0.25), series.quantile(0.75), factor
    )
    return series[series.between(fence_low, fence_high)]


def remove_outliers_from_dataframe(self, df, col, factor=1.5):
    """Return the rows of ``df`` whose ``col`` value lies within the IQR fences.

    Args:
        self: Unused placeholder, retained only so the existing signature
            stays unchanged; pass ``None``.
        df: DataFrame to filter; it is not modified.
        col: Label of the numeric column to screen.
        factor: Multiplier applied to the IQR to place the fences.

    Returns:
        A DataFrame with the rows whose ``col`` value is inside the fences
        (fence values included).
    """
    column = df[col]
    fence_low, fence_high = _iqr_fences(
        column.quantile(0.25), column.quantile(0.75), factor
    )
    return df.loc[column.between(fence_low, fence_high)]


def remove_outliers_from_np_array(self, arr, factor=1.5):
    """Return the elements of ``arr`` that lie within the IQR fences.

    Args:
        self: Unused placeholder, retained only so the existing signature
            stays unchanged; pass ``None``.
        arr: NumPy array of numeric values; it is not modified.
        factor: Multiplier applied to the IQR to place the fences.

    Returns:
        A new array with the non-outlier elements (fence values included).
        An empty input yields an empty copy.
    """
    # Percentiles of an empty array are undefined; there is nothing to filter.
    if arr.size == 0:
        return arr.copy()
    q1, q3 = np.percentile(arr, [25, 75])
    fence_low, fence_high = _iqr_fences(q1, q3, factor)
    return arr[(arr >= fence_low) & (arr <= fence_high)]


def remove_outliers_from_python_list(self, _list, factor=1.5):
    """Return a list with the IQR outliers of ``_list`` removed.

    Args:
        self: Unused placeholder, retained only so the existing signature
            stays unchanged; pass ``None``.
        _list: Python list of numeric values; it is not modified.
        factor: Multiplier applied to the IQR to place the fences.

    Returns:
        A new list of the surviving elements, in their original order.  The
        elements are the NumPy scalars produced by iterating the filtered
        array.
    """
    arr = np.array(_list)
    # The array helper takes a leading placeholder argument; supply it here.
    return list(remove_outliers_from_np_array(None, arr, factor))


def remove_outliers(*args, **kwargs):
    """Remove IQR outliers from a DataFrame, Series, ndarray or list.

    The type of the first positional argument selects the implementation;
    all positional and keyword arguments are forwarded to it (for a
    DataFrame the column label must follow the frame).

    Returns:
        The filtered data, in the same container kind as the input.

    Raises:
        TypeError: If the first argument is not one of the supported types.
    """
    data = args[0]
    # Three helpers declare a leading placeholder parameter, so ``None`` is
    # passed ahead of the caller's arguments for them.
    if isinstance(data, pd.DataFrame):
        return remove_outliers_from_dataframe(None, *args, **kwargs)
    if isinstance(data, pd.Series):
        return remove_outliers_from_series(*args, **kwargs)
    if isinstance(data, np.ndarray):
        return remove_outliers_from_np_array(None, *args, **kwargs)
    if isinstance(data, list):
        return remove_outliers_from_python_list(None, *args, **kwargs)
    raise TypeError(f'{type(data)} is not supported.')
xxxxxxxxxx
from scipy import stats

# 100 rows x 3 columns of standard-normal samples.
df = pd.DataFrame(np.random.randn(100, 3))

# True for rows in which every column's absolute z-score is below 3.
rows_within_3_sigma = (np.abs(stats.zscore(df)) < 3).all(axis=1)
df[rows_within_3_sigma]
xxxxxxxxxx
from scipy import stats

# Sample data: 100 rows x 3 columns drawn from a standard normal.
df = pd.DataFrame(np.random.randn(100, 3))

# Absolute z-score of every cell, computed column by column.
abs_z = np.abs(stats.zscore(df))
# Select the rows where all three columns stay under 3 standard deviations.
df[(abs_z < 3).all(axis=1)]
xxxxxxxxxx
# Method 1
def remove_outlier(df_in, col_name, factor=1.5):
    """Return the rows of ``df_in`` whose ``col_name`` value is not an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR``
    or strictly above ``Q3 + factor * IQR`` (Tukey's fences).  Values sitting
    exactly on a fence are kept, so a column with zero spread (``IQR == 0``)
    is returned intact instead of being emptied.

    Args:
        df_in: DataFrame to filter; it is not modified.
        col_name: Label of the numeric column to screen.
        factor: Multiplier applied to the IQR to place the fences
            (1.5 by default).

    Returns:
        A DataFrame containing the rows of ``df_in`` that fall inside the
        fences.  Rows where the column is NaN are dropped, because NaN never
        compares as being inside the range.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - factor * iqr
    fence_high = q3 + factor * iqr
    # ``between`` includes both endpoints, so on-the-fence values survive.
    return df_in.loc[values.between(fence_low, fence_high)]
# Method 2
def remove_outliers(df_in, col_name, n_std=3):
    """Return the rows of ``df_in`` whose ``col_name`` value is near the mean.

    A row is kept when its value lies within ``n_std`` standard deviations of
    the column mean, bounds included.  Including the bounds means a
    zero-variance column is returned intact instead of being emptied.

    Args:
        df_in: DataFrame to filter; it is not modified.
        col_name: Label of the numeric column to screen.
        n_std: Half-width of the accepted band, in standard deviations
            (3 by default).

    Returns:
        A DataFrame with the rows inside ``mean +/- n_std * std``.  Rows
        where the column is NaN are dropped, because NaN never compares as
        being inside the range.
    """
    values = df_in[col_name]
    mean = values.mean()
    cut_off = values.std() * n_std
    lower, upper = mean - cut_off, mean + cut_off
    # ``between`` includes both endpoints.
    return df_in[values.between(lower, upper)]
# Method 3 : Not recommended
def trim_outliers(df_in, col_name, quantile_value=0.95):
    """Keep only the rows whose ``col_name`` value is below a quantile cut-off.

    Args:
        df_in: DataFrame to filter; it is not modified.
        col_name: Label of the column to screen.
        quantile_value: Quantile (0-1) of the column used as the cut-off.

    Returns:
        The rows of ``df_in`` whose value is strictly less than the cut-off;
        rows equal to the cut-off are dropped.
    """
    column = df_in[col_name]
    cutoff = column.quantile(quantile_value)
    below_cutoff = column < cutoff
    return df_in[below_cutoff]
xxxxxxxxxx
You have to define the range of acceptable values for that particular column.
# Keep only the rows whose col_name value lies strictly between the two fences.
df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
There is no ready-made function that does this directly.
xxxxxxxxxx
# Select the rows whose "col" value is in [x, y], both endpoints included.
df[(df["col"] >= x ) & (df["col"] <= y )]
but it's more readable to use:
# Select rows with Series.between, which includes both endpoints by default.
df[df["col"].between(x,y)]