# Number of missing (NULL) values in a column
df.where(df['col_name'].isNull()).count()

# Visualise missing values with a heatmap.
# toPandas() collects the entire DataFrame onto the driver, so only use this
# on data that fits in driver memory.
# NOTE(review): this step reads `spark_df` while every other step uses `df`
# -- confirm both names refer to the same DataFrame.
pandas_df = spark_df.toPandas()
sns.heatmap(data=pandas_df.isnull())

# Drop any record that contains at least one NULL value
df = df.dropna()

# Drop a record only if BOTH col1 and col2 are NULL
# (e.g. LISTPRICE and SALESCLOSEPRICE)
df = df.dropna(how='all', subset=['col1', 'col2'])

# Keep only records that have at least two non-NULL values: `thresh` is the
# minimum number of non-NULL values a row needs to survive (and it overrides
# `how`). To instead drop rows with two or more NULLs, use
# thresh=len(df.columns) - 1.
df = df.dropna(thresh=2)

# Drop columns with >30% missing values.
# NOTE(review): `col_list` is not defined in this section; it is presumably
# the list of column names whose NULL ratio exceeds 0.3 -- confirm where it
# is built.
df = df.drop(*col_list)

# Replace missing values in a column with that column's mean.
# DataFrames are immutable, so the result of fillna() must be assigned back.
col_mean = df.agg({'col_name': 'mean'}).collect()[0][0]
df = df.fillna(col_mean, subset=['col_name'])

# Drop duplicates, keeping one row per distinct value of col_name.
# dropDuplicates() also returns a new DataFrame, so assign it back.
df = df.dropDuplicates(['col_name'])