# Binarizing (create a column whose values are 0.0 or 1.0)
from pyspark.ml.feature import Binarizer
df = df.withColumn('val', df['val'].cast('double'))  # Binarizer requires a double-typed input column
binarizer = Binarizer(threshold=0.0, inputCol='val', outputCol='binary_col')
df = binarizer.transform(df)  # values > threshold map to 1.0, the rest to 0.0
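# A minimal sanity-check sketch on toy data (assumes an existing SparkSession named `spark`;
# the values are made up)
toy = spark.createDataFrame([(-1.0,), (0.0,), (2.5,)], ['val'])
binarizer.transform(toy).show()  # binary_col: 0.0, 0.0, 1.0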
# Bucketing
from pyspark.ml.feature import Bucketizer
splits = [0, 1, 2, 3, 4, float('Inf')]  # Bucket boundaries: [0,1), [1,2), [2,3), [3,4), [4,Inf)
# Create bucketing transformer
buck = Bucketizer(splits=splits, inputCol='BATHSTOTAL', outputCol='baths')
# Apply transformer
df = buck.transform(df)
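# By default, values outside the splits (here, anything below 0) raise an error at
# transform time; handleInvalid='keep' routes them into one extra bucket instead (an
# alternative sketch, not applied above)
buck_keep = Bucketizer(splits=splits, inputCol='BATHSTOTAL', outputCol='baths',
                       handleInvalid='keep')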
# One-hot encoding (works inside a PySpark Pipeline and with PySpark ML models)
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
string_indexer = StringIndexer(inputCol='cat_col', outputCol='Cat_Index') # Map strings to numbers with string indexer
indexed_df = string_indexer.fit(df).transform(df)
encoder = OneHotEncoder(inputCol='Cat_Index', outputCol='Onehot_feature') # One-hot encode the indexed values
encoded_df = encoder.fit(indexed_df).transform(indexed_df) # In Spark 3.x OneHotEncoder is an estimator, so fit before transform
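# Note: OneHotEncoder drops the last category by default, so k categories yield a
# sparse vector of length k-1; dropLast=False keeps all k slots (an alternative sketch)
encoder_full = OneHotEncoder(inputCol='Cat_Index', outputCol='Onehot_feature',
                             dropLast=False)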
# Using a Pipeline to chain several steps at once
from pyspark.ml import Pipeline
feature_cols = list(df.columns) # Start with every column as a candidate feature
feature_cols.remove('SALESCLOSEPRICE') # Remove the dependent variable (the label); drop problem columns the same way
df = df.fillna(-1) # VectorAssembler does not accept nulls
vec_assembler = VectorAssembler(inputCols=["feature1", "feature2", "Onehot_feature"], outputCol="features") # or pass feature_cols once every entry is numeric
pipeline = Pipeline(stages=[string_indexer, encoder, vec_assembler]) # A model can be the last stage, e.g. stages=[..., model]
pipeline_model = pipeline.fit(df)
transformed_df = pipeline_model.transform(df)
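# A fitted pipeline can be persisted and reloaded as one unit, assuming write access
# to the path below (the path name is arbitrary)
pipeline_model.save('pipeline_model_path')
from pyspark.ml import PipelineModel
pipeline_model2 = PipelineModel.load('pipeline_model_path')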
# Split data
train_df, test_df = transformed_df.randomSplit([0.8, 0.2], seed=42) # Hold out a test set; 80/20 is a typical choice
# Create Model
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol="features", labelCol="SALESCLOSEPRICE",
                           predictionCol="Prediction_Price", seed=42)
model = rf.fit(train_df) # Train model
predictions = model.transform(test_df)
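# Peek at predictions next to the actual label (column names taken from above)
predictions.select('SALESCLOSEPRICE', 'Prediction_Price').show(5)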
model.save('rfr_model') # Save model
from pyspark.ml.regression import RandomForestRegressionModel
model2 = RandomForestRegressionModel.load('rfr_model') # Load the model
# Evaluate Model
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="SALESCLOSEPRICE", predictionCol="Prediction_Price")
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
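# Other supported metric names include 'mae' and 'mse'; the print is just for inspection
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
print(f'RMSE: {rmse:.2f}  R2: {r2:.3f}  MAE: {mae:.2f}')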
# Feature importance
import pandas as pd
# Convert feature importances to a pandas DataFrame
importance_df = pd.DataFrame(model.featureImportances.toArray(), columns=['importance'])
importance_df['features'] = pd.Series(feature_cols) # Create a new column to hold feature names
importance_df.sort_values(by=['importance'], ascending=False, inplace=True) # Sort the data based on feature importance
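# Caveat: one-hot encoded columns expand into multiple vector slots, so these names only
# line up one-to-one when every assembled feature is a single numeric column
print(importance_df.head(10)) # Inspect the top features (10 is an arbitrary cutoff)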