# Inspect missing data: build the NaN mask once, then summarise it per column
missing_mask = df.isna()
missing_mask.any()  # True for each column that contains at least one NaN
missing_mask.sum()  # Number of NaN entries in each column

# Plot the missing-data matrix of the frame with missingno
import missingno as msno
import matplotlib.pyplot as plt

msno.matrix(df)
plt.show()
# Drop missing data
# Drop the rows in which 'col' is missing. `subset` names labels on the
# *other* axis, so a list of column names is only valid when dropping rows
# (axis=0); combined with axis=1 it would be looked up among the row labels.
df_dropped = df.dropna(subset=['col'], axis=0)
# The two calls below return new frames; assign the result to keep it.
df.dropna(axis=0)  # Drop every row containing a missing value (default)
df.dropna(axis=1)  # Drop every column containing a missing value
# Replace/impute missing data with single value
import numpy as np  # np.nan is used by the replace() alternative below

col_mean = df['col'].mean()
# Fill NaN in 'col' only; the dict maps column name -> fill value.
df_imputed = df.fillna({'col': col_mean})
# Backward fill (fillna(method='bfill') is deprecated since pandas 2.1).
df.bfill()
df.interpolate(method='linear')
# Alternative to fillna: replace NaN with the column mean. The result is
# assigned back because an in-place replace on the `df['col']` selection may
# act on a temporary object and leave `df` unchanged.
df['col'] = df['col'].replace(to_replace=np.nan, value=col_mean)
# Impute missing values from another Series (values are aligned on the index)
series_imp = df['col1'].mul(5)
df_imputed = df.fillna(value={'col2': series_imp})
# Frequency table of 'col' -- scan it for suspicious values
df["col"].value_counts()
##### Strategic dropping example ########
# Drop rows with missing values in columns where <= 5% of the data is missing;
# impute the remaining columns instead.
threshold = len(df) * 0.05
cols_to_drop = df.columns[df.isna().sum() <= threshold]
df.dropna(subset=cols_to_drop, inplace=True)

# Columns that still contain missing values after the row drop.
cols_with_missing_values = df.columns[df.isna().sum() > 0]
# The last such column is skipped here -- presumably it is the one imputed
# per subgroup below; confirm against the actual column order.
for col in cols_with_missing_values[:-1]:
    # fillna returns a new Series, so assign it back to keep the imputation.
    df[col] = df[col].fillna(df[col].mode()[0])

# Impute "num_col" with the median of its "cat_col" subgroup.
subgroup_dict = df.groupby("cat_col")["num_col"].median().to_dict()
df["num_col"] = df["num_col"].fillna(df["cat_col"].map(subgroup_dict))