# Binarizing (create a column whose values are 0.0 or 1.0)
from pyspark.ml.feature import Binarizer
df = df.withColumn('val', df['val'].cast('double'))  # Binarizer requires a double-typed input column
binarizer = Binarizer(threshold=0.0, inputCol='val', outputCol='binary_col')
df = binarizer.transform(df)  # values > threshold map to 1.0, the rest to 0.0
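# A minimal sanity-check sketch on toy data (assumes an existing SparkSession named `spark`;
# the values are made up)
toy = spark.createDataFrame([(-1.0,), (0.0,), (2.5,)], ['val'])
binarizer.transform(toy).show()  # binary_col: 0.0, 0.0, 1.0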
# Bucketing
from pyspark.ml.feature import Bucketizer
splits = [0, 1, 2, 3, 4, float('Inf')]  # Bucket boundaries: [0,1), [1,2), [2,3), [3,4), [4,Inf)
# Create bucketing transformer
buck = Bucketizer(splits=splits, inputCol='BATHSTOTAL', outputCol='baths')
# Apply transformer
df = buck.transform(df)
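# By default, values outside the splits (here, anything below 0) raise an error at
# transform time; handleInvalid='keep' routes them into one extra bucket instead (an
# alternative sketch, not applied above)
buck_keep = Bucketizer(splits=splits, inputCol='BATHSTOTAL', outputCol='baths',
                       handleInvalid='keep')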
# One-hot encoding (works inside a PySpark Pipeline and with PySpark ML models)
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
string_indexer = StringIndexer(inputCol='cat_col', outputCol='Cat_Index') # Map strings to numbers with string indexer
indexed_df = string_indexer.fit(df).transform(df)
encoder = OneHotEncoder(inputCol='Cat_Index', outputCol='Onehot_feature') # One-hot encode the indexed values
encoded_df = encoder.fit(indexed_df).transform(indexed_df) # In Spark 3.x OneHotEncoder is an estimator, so fit before transform
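# Note: OneHotEncoder drops the last category by default, so k categories yield a
# sparse vector of length k-1; dropLast=False keeps all k slots (an alternative sketch)
encoder_full = OneHotEncoder(inputCol='Cat_Index', outputCol='Onehot_feature',
                             dropLast=False)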
# Using a Pipeline to chain several steps at once
from pyspark.ml import Pipeline
feature_cols = list(df.columns) # Start with every column as a candidate feature
feature_cols.remove('SALESCLOSEPRICE') # Remove the dependent variable (the label); drop problem columns the same way
df = df.fillna(-1) # VectorAssembler does not accept nulls
vec_assembler = VectorAssembler(inputCols=["feature1", "feature2", "Onehot_feature"], outputCol="features") # or pass feature_cols once every entry is numeric
pipeline = Pipeline(stages=[string_indexer, encoder, vec_assembler]) # A model can be the last stage, e.g. stages=[..., model]
pipeline_model = pipeline.fit(df)
transformed_df = pipeline_model.transform(df)
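# A fitted pipeline can be persisted and reloaded as one unit, assuming write access
# to the path below (the path name is arbitrary)
pipeline_model.save('pipeline_model_path')
from pyspark.ml import PipelineModel
pipeline_model2 = PipelineModel.load('pipeline_model_path')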
# Split data
train_df, test_df = transformed_df.randomSplit([0.8, 0.2], seed=42) # Hold out a test set; 80/20 is a typical choice
# Create Model
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol="features", labelCol="SALESCLOSEPRICE",
                           predictionCol="Prediction_Price", seed=42)
model = rf.fit(train_df) # Train model
predictions = model.transform(test_df)
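# Peek at predictions next to the actual label (column names taken from above)
predictions.select('SALESCLOSEPRICE', 'Prediction_Price').show(5)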
model.save('rfr_model') # Save model
from pyspark.ml.regression import RandomForestRegressionModel
model2 = RandomForestRegressionModel.load('rfr_model') # Load the model
# Evaluate Model
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="SALESCLOSEPRICE", predictionCol="Prediction_Price")
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
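# Other supported metric names include 'mae' and 'mse'; the print is just for inspection
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
print(f'RMSE: {rmse:.2f}  R2: {r2:.3f}  MAE: {mae:.2f}')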
# Feature importance
import pandas as pd
# Convert feature importances to a pandas DataFrame
importance_df = pd.DataFrame(model.featureImportances.toArray(), columns=['importance'])
importance_df['features'] = pd.Series(feature_cols) # Create a new column to hold feature names
importance_df.sort_values(by=['importance'], ascending=False, inplace=True) # Sort the data based on feature importance
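# Caveat: one-hot encoded columns expand into multiple vector slots, so these names only
# line up one-to-one when every assembled feature is a single numeric column
print(importance_df.head(10)) # Inspect the top features (10 is an arbitrary cutoff)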