# Visualize distribution
# Pairwise relationships between the columns of df; seaborn builds its own
# figure (a grid of axes) for this plot.
sns.pairplot(df)
# DataFrame.boxplot() draws on the *current* axes when no `ax` is passed.
# Right after pairplot() that is one of the pair-grid panels, so the box plot
# would be painted over it. Give the box plot a dedicated figure instead.
box_fig, box_ax = plt.subplots()
df[['column_1']].boxplot(ax=box_ax)
plt.show()
# One-hot encoding: one indicator column per category of 'cat', named C_<value>.
# get_dummies returns a new DataFrame and leaves df untouched, so the result
# has to be bound to a name to be usable. df is deliberately not overwritten:
# the raw 'cat' column is still needed by the steps that follow.
one_hot_df = pd.get_dummies(df, columns=['cat'], prefix='C')
# Dummy encoding: as above, but drop_first=True omits the first category's
# indicator column (it is implied when all the others are 0).
dummy_df = pd.get_dummies(df, columns=['cat'], drop_first=True, prefix='C')
# Merging low frequency categorical counts
# Categories of 'cat' that occur fewer than 5 times are collapsed into 'Other'.
counts = df['cat'].value_counts()
mask = df['cat'].isin(counts[counts < 5].index)
# Assign through a single .loc call. The chained form df['cat'][mask] = ...
# writes to an intermediate object: it triggers SettingWithCopyWarning and,
# with pandas Copy-on-Write enabled, never modifies df at all.
df.loc[mask, 'cat'] = 'Other'
# Binarizing numeric variables
# Binary_col starts at 0 for every row and is flipped to 1 only where
# Number_col is strictly greater than zero.
df['Binary_col'] = 0
df.loc[df['Number_col'].gt(0), 'Binary_col'] = 1
import numpy as np
# Binning numeric variables
# pd.cut uses right-closed intervals by default, so the three groups are
# (-inf, 0] -> 1, (0, 2] -> 2 and (2, inf] -> 3.
df['Binned_Group'] = pd.cut(
    df['Number_col'],
    bins=[-np.inf, 0, 2, np.inf],
    labels=[1, 2, 3],
)
# TODO: scale / standardize data
# TODO: deal with missing values
# TODO: deal with outliers
# Validate numeric columns
# Strip thousands separators but keep the values as text for now. Casting with
# .astype('float') at this point would raise on the first unparseable value,
# before the sanity check below ever had a chance to report it.
cleaned_salary = df['RawSalary'].str.replace(',', '', regex=False)
# errors='coerce' turns anything that still is not a number into NaN.
coerced_vals = pd.to_numeric(cleaned_salary, errors='coerce')
# Print before overwriting the column so the offending raw text is visible.
print(df[coerced_vals.isna()].head()) # Sanity check which values still show errors
# Store the numeric result; values that could not be parsed end up as NaN.
df['RawSalary'] = coerced_vals.astype('float')
### Automatic feature generation from related tables
import featuretools as ft
# NOTE(review): entity_from_dataframe / parent_variable / target_entity look
# like the pre-1.0 featuretools API names - confirm against the installed version.
es = ft.EntitySet(id="my_data")  # Define database
# Define tables
# (the closing parenthesis of the first call was missing, which made the
# whole file fail to parse)
es = es.entity_from_dataframe(
    entity_id="customers",
    dataframe=customers_df,
    index="customer_id",
)
es = es.entity_from_dataframe(
    entity_id="transactions",
    dataframe=transactions_df,
    index="transaction_id",
    time_index=None,
)
# Define relationship: customers is the parent and transactions the child,
# joined on customer_id.
relationship = ft.Relationship(
    parent_variable=es["customers"]["customer_id"],
    child_variable=es["transactions"]["customer_id"],
)
es = es.add_relationship(relationship)
# Run Deep Feature Synthesis
features, feature_names = ft.dfs(entityset=es, target_entity="customers")
print(features)  # Display the generated features
### Time series feature generation
# extract_features is exported from the tsfresh package itself.
# NOTE(review): tsfresh does not appear to ship a `tsfresh.spark` module
# (its Spark helper lives in tsfresh.convenience.bindings) - confirm against
# the installed version.
from tsfresh import extract_features
# Column roles are given by name: id_col is passed as column_id, time_col as
# column_sort and value_col as column_value.
extracted_features = extract_features(
    df,
    column_id="id_col",
    column_sort="time_col",
    column_value="value_col",
)
# The result is a pandas DataFrame, which has no Spark-style .show() method.
print(extracted_features)  # Display the extracted features