pyspark rdd machine learning

Comment

3

Tip Innocent Iguana 1 GREPCC

# Raw records: (class label, numeric feature, gender, numeric feature)
data = [
    ('A', 0.7, 'm', 1.0),
    ('B', 0.1, 'f', 0.3),
    ('A', 0.8, 'm', 0.2),
    ('C', 0.2, 'f', 0.5),
    ('C', 0.5, 'f', 0.6),
]
rdd = sc.parallelize(data)

# Convert label to numerical values
label_mapping = {'A': 0, 'B': 1, 'C': 2}

# One-hot encoding (using Pipeline and the pyspark DataFrame construct).
# The raw tuples are given explicit column names so the feature stages can
# address the gender column; the previous inputCol='_2' named a column that
# the DataFrame never contained.
pipeline = Pipeline(stages=[
    StringIndexer(inputCol='gender', outputCol='gender_index'),
    OneHotEncoder(inputCol='gender_index', outputCol='gender_encoded'),
])
df = spark.createDataFrame(rdd, ["label", "feature_1", "gender", "feature_2"])
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)

# Convert DataFrame back to an RDD of (numeric label, numeric features, encoded gender).
# Only the two float columns go into the dense vector: packing the 'm'/'f'
# string into Vectors.dense fails because it cannot be converted to float.
rdd = df.rdd.map(lambda row: (
    float(label_mapping[row.label]),
    Vectors.dense([row.feature_1, row.feature_2]),
    row.gender_encoded,
))

# Convert data to LabeledPoint (exposes the label as .label and the features as .features)
labeled_rdd = rdd.map(lambda x: LabeledPoint(x[0], x[1]))

# Split the data into training and testing sets
(trainingData, testData) = rdd.randomSplit([0.8, 0.2])

# The supervised mllib trainers only accept an RDD of LabeledPoint, while the
# split RDDs hold (label, features, gender_encoded) tuples, so wrap them first.
trainingPoints = trainingData.map(lambda x: LabeledPoint(x[0], x[1]))

# Train the models
# label_mapping defines three classes; logistic regression defaults to numClasses=2.
model_lr = LogisticRegressionWithLBFGS.train(trainingPoints, numClasses=3)
model_lin = LinearRegressionWithSGD.train(trainingPoints, iterations=100, step=0.1)
model_rf = RandomForest.trainClassifier(trainingPoints, numClasses=3, categoricalFeaturesInfo={},
                                        numTrees=10, featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=4, maxBins=32)
# KMeans is unsupervised, so it is trained on the feature vectors alone.
kmeans_model = KMeans.train(trainingData.map(lambda x: x[1]), k=3, maxIterations=10, initializationMode="random")

# Train the model with deep learning
# MultilayerPerceptronClassifier is part of the DataFrame-based pyspark.ml API:
# it is configured through its constructor and trained with fit() on a
# DataFrame; it has no RDD-style static train() method.
input_size = len(trainingData.first()[1])
output_size = 3  # number of classes
hidden_layers = [input_size, 5, output_size]  # input layer size, hidden layer sizes, output layer size
# NOTE(review): assumes x[1] is a pyspark.mllib vector; asML() converts it to
# the pyspark.ml vector type the classifier expects — confirm against the data prep.
mlp_training_df = spark.createDataFrame(
    trainingData.map(lambda x: (float(x[0]), x[1].asML())),
    ["label", "features"],
)
model_mlp = MultilayerPerceptronClassifier(
    layers=hidden_layers, maxIter=100, stepSize=0.1
).fit(mlp_training_df)

# Make predictions on the test data
test_features = testData.map(lambda x: x[1])
test_labels = testData.map(lambda x: float(x[0]))

# Per the MLlib docs, a tree-ensemble model's predict() cannot be used inside an
# RDD transformation in Python: call it on the whole feature RDD and zip the
# result back with the labels to get (prediction, label) pairs.
predictions = model_rf.predict(test_features).zip(test_labels)
# The linear models score one vector at a time, so they can run inside map().
predictions_lr = testData.map(lambda x: (float(model_lr.predict(x[1])), float(x[0])))
predictions_lin = testData.map(lambda x: (float(model_lin.predict(x[1])), float(x[0])))
predictions_kmeans = kmeans_model.predict(test_features)

# Evaluate Random Forest model
metrics = MulticlassMetrics(predictions)
accuracy = metrics.accuracy

# Evaluate Logistic Regression model
# NOTE(review): areaUnderROC is a binary-classification metric while
# label_mapping defines three classes — confirm this is the intended measure.
metrics_lr = BinaryClassificationMetrics(predictions_lr)
auc_roc_lr = metrics_lr.areaUnderROC

# Evaluate Linear Regression model
metrics_lin = RegressionMetrics(predictions_lin)
rmse_lin = metrics_lin.rootMeanSquaredError

# Root-mean-squared error helper (this is RMSE, not R-squared)
def calculate_rmse(predictions):
    """Return the root-mean-squared error over (predicted, actual) pairs.

    ``predictions`` must offer ``map`` and ``mean`` the way a pyspark RDD does;
    each element is indexed as ``pair[0]`` (predicted) and ``pair[1]`` (actual).
    """
    # Imported locally: no module-level numpy import is visible in this snippet.
    import numpy as np

    squared_errors = predictions.map(lambda pair: (pair[0] - pair[1]) ** 2)
    return np.sqrt(squared_errors.mean())

# Calculate RMSE for KMeans model
# predictions_kmeans is the RDD of predicted cluster ids and has no predict();
# the trained kmeans_model is what assigns a cluster to a feature vector.
rmse_kmeans = calculate_rmse(testData.map(lambda x: (kmeans_model.predict(x[1]), x[0])))

sc.stop()

xxxxxxxxxx

# Raw records: (class label, numeric feature, gender, numeric feature)
# The closing bracket of this list was missing, which made the snippet unparsable.
data = [
    ('A', 0.7, 'm', 1.0),
    ('B', 0.1, 'f', 0.3),
    ('A', 0.8, 'm', 0.2),
    ('C', 0.2, 'f', 0.5),
    ('C', 0.5, 'f', 0.6),
]
rdd = sc.parallelize(data)

# Convert label to numerical values
label_mapping = {'A': 0, 'B': 1, 'C': 2}

# One-hot encoding (using Pipeline and the pyspark DataFrame construct).
# The raw tuples are given explicit column names so the feature stages can
# address the gender column; the previous inputCol='_2' named a column that
# the DataFrame never contained.
pipeline = Pipeline(stages=[
    StringIndexer(inputCol='gender', outputCol='gender_index'),
    OneHotEncoder(inputCol='gender_index', outputCol='gender_encoded'),
])
df = spark.createDataFrame(rdd, ["label", "feature_1", "gender", "feature_2"])
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)

# Convert DataFrame back to an RDD of (numeric label, numeric features, encoded gender).
# Only the two float columns go into the dense vector: packing the 'm'/'f'
# string into Vectors.dense fails because it cannot be converted to float.
rdd = df.rdd.map(lambda row: (
    float(label_mapping[row.label]),
    Vectors.dense([row.feature_1, row.feature_2]),
    row.gender_encoded,
))

# Convert data to LabeledPoint (exposes the label as .label and the features as .features)
labeled_rdd = rdd.map(lambda x: LabeledPoint(x[0], x[1]))

# Split the data into training and testing sets
(trainingData, testData) = rdd.randomSplit([0.8, 0.2])

# The supervised mllib trainers only accept an RDD of LabeledPoint, while the
# split RDDs hold (label, features, gender_encoded) tuples, so wrap them first.
trainingPoints = trainingData.map(lambda x: LabeledPoint(x[0], x[1]))

# Train the models
# label_mapping defines three classes; logistic regression defaults to numClasses=2.
model_lr = LogisticRegressionWithLBFGS.train(trainingPoints, numClasses=3)
model_lin = LinearRegressionWithSGD.train(trainingPoints, iterations=100, step=0.1)
model_rf = RandomForest.trainClassifier(trainingPoints, numClasses=3, categoricalFeaturesInfo={},
                                        numTrees=10, featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=4, maxBins=32)
# KMeans is unsupervised, so it is trained on the feature vectors alone.
kmeans_model = KMeans.train(trainingData.map(lambda x: x[1]), k=3, maxIterations=10, initializationMode="random")

# Train the model with deep learning
# MultilayerPerceptronClassifier is part of the DataFrame-based pyspark.ml API:
# it is configured through its constructor and trained with fit() on a
# DataFrame; it has no RDD-style static train() method.
input_size = len(trainingData.first()[1])
output_size = 3  # number of classes
hidden_layers = [input_size, 5, output_size]  # input layer size, hidden layer sizes, output layer size
# NOTE(review): assumes x[1] is a pyspark.mllib vector; asML() converts it to
# the pyspark.ml vector type the classifier expects — confirm against the data prep.
mlp_training_df = spark.createDataFrame(
    trainingData.map(lambda x: (float(x[0]), x[1].asML())),
    ["label", "features"],
)
model_mlp = MultilayerPerceptronClassifier(
    layers=hidden_layers, maxIter=100, stepSize=0.1
).fit(mlp_training_df)

# Make predictions on the test data
test_features = testData.map(lambda x: x[1])
test_labels = testData.map(lambda x: float(x[0]))

# Per the MLlib docs, a tree-ensemble model's predict() cannot be used inside an
# RDD transformation in Python: call it on the whole feature RDD and zip the
# result back with the labels to get (prediction, label) pairs.
predictions = model_rf.predict(test_features).zip(test_labels)
# The linear models score one vector at a time, so they can run inside map().
predictions_lr = testData.map(lambda x: (float(model_lr.predict(x[1])), float(x[0])))
predictions_lin = testData.map(lambda x: (float(model_lin.predict(x[1])), float(x[0])))
predictions_kmeans = kmeans_model.predict(test_features)

# Evaluate Random Forest model
metrics = MulticlassMetrics(predictions)
accuracy = metrics.accuracy

# Evaluate Logistic Regression model
# NOTE(review): areaUnderROC is a binary-classification metric while
# label_mapping defines three classes — confirm this is the intended measure.
metrics_lr = BinaryClassificationMetrics(predictions_lr)
auc_roc_lr = metrics_lr.areaUnderROC

# Evaluate Linear Regression model
metrics_lin = RegressionMetrics(predictions_lin)
rmse_lin = metrics_lin.rootMeanSquaredError

# Root-mean-squared error helper (this is RMSE, not R-squared)
def calculate_rmse(predictions):
    """Return the root-mean-squared error over (predicted, actual) pairs.

    ``predictions`` must offer ``map`` and ``mean`` the way a pyspark RDD does;
    each element is indexed as ``pair[0]`` (predicted) and ``pair[1]`` (actual).
    """
    # Imported locally: no module-level numpy import is visible in this snippet.
    import numpy as np

    squared_errors = predictions.map(lambda pair: (pair[0] - pair[1]) ** 2)
    return np.sqrt(squared_errors.mean())

# Calculate RMSE for KMeans model
# predictions_kmeans is the RDD of predicted cluster ids and has no predict();
# the trained kmeans_model is what assigns a cluster to a feature vector.
rmse_kmeans = calculate_rmse(testData.map(lambda x: (kmeans_model.predict(x[1]), x[0])))

sc.stop()

Popularity 6/10 Helpfulness 5/10 Language python

Source: Grepper

Tags: machine-learning pyspark python rdd

Share

Link to this answer
Share Copy Link

Contributed on Feb 18 2024

Innocent Iguana

0 Answers Avg Quality 2/10

pyspark als rdd

Comment

4

Tip Innocent Iguana 1 GREPCC

############# using pyspark RDD ##############
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, Rating

# Start a local Spark context for this example
sc = SparkContext(master="local", appName="ALS Example")

# Sample rows: (User ID, Item ID, Rating, Additional Column1, Additional Column2)
data = [
    (1, 1, 5, "A", "X"),
    (1, 2, 4, "B", "Y"),
    (2, 1, 3, "C", "Z"),
    (2, 2, 5, "D", "W"),  # <--- say this data is for testing
    (3, 1, 4, "E", "V"),
    (3, 2, 2, "F", "U"),
]
ratings_rdd = sc.parallelize(data)

# Keep only the first three fields and wrap them as Rating(user, product, rating)
ratings = ratings_rdd.map(lambda row: Rating(*row[:3]))  # Rating(user=1, product=1, rating=5.0)
# Random 80/20 split into training and test sets
training_data, test_data = ratings.randomSplit([0.8, 0.2])

# Fit the ALS model
rank = 10  # Number of latent factors
num_iterations = 10  # Number of iterations
model = ALS.train(training_data, rank, iterations=num_iterations)

# Score every (user, item) pair that appears in the test set
test_user_item_pairs = test_data.map(lambda rating: (rating[0], rating[1]))
predictions = model.predictAll(test_user_item_pairs)
# Key each prediction by its (user, item) pair
predictions = predictions.map(lambda pred: (pred[:2], pred[2]))  # ((2, 2), 5.008601768134059)


def _squared_error(joined):
    """Squared difference for one ((user, item), (actual, predicted)) entry."""
    _, (actual, predicted) = joined
    return (actual - predicted) ** 2


# Pair the actual ratings with the predicted ones on the shared (user, item) key
rates_and_preds = test_data.map(
    lambda rating: (rating[:2], rating[2])
).join(predictions)  # ((2, 2), (5.0, 5.008601768134059))
# Root of the mean squared error
RMSE = rates_and_preds.map(_squared_error).mean() ** 0.5

xxxxxxxxxx

############# using pyspark RDD ##############

from pyspark import SparkContext

from pyspark.mllib.recommendation import ALS, Rating

# Initialize SparkContext
sc = SparkContext("local", "ALS Example")

# Sample data (User ID, Item ID, Rating, Additional Column1, Additional Column2)
# The closing bracket of this list was missing, which made the snippet unparsable.
data = [
    (1, 1, 5, "A", "X"),
    (1, 2, 4, "B", "Y"),
    (2, 1, 3, "C", "Z"),
    (2, 2, 5, "D", "W"),  # <--- say this data is for testing
    (3, 1, 4, "E", "V"),
    (3, 2, 2, "F", "U"),
]
ratings_rdd = sc.parallelize(data)

# Map the data to Rating objects (User ID, Item ID, Rating); the extra columns are dropped
ratings = ratings_rdd.map(lambda x: Rating(x[0], x[1], x[2]))  # Rating(user=1, product=1, rating=5.0)
training_data, test_data = ratings.randomSplit([0.8, 0.2])  # Split data into training and test sets

# Train ALS model
rank = 10  # Number of latent factors
num_iterations = 10  # Number of iterations
model = ALS.train(training_data, rank, num_iterations)

# Make predictions on test data, keyed by (user, item) so they can be joined
test_user_item_pairs = test_data.map(lambda x: (x[0], x[1]))
predictions = model.predictAll(test_user_item_pairs)
predictions = predictions.map(lambda r: ((r[0], r[1]), r[2]))  # ((2, 2), 5.008601768134059)

# Join predicted ratings with actual ratings
rates_and_preds = test_data.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)  # ((2, 2), (5.0, 5.008601768134059))
RMSE = (rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())**0.5  # Calculate RMSE

Popularity 6/10 Helpfulness 5/10 Language python

Source: Grepper

Tags: als pyspark python rdd

Share

Link to this answer
Share Copy Link

Contributed on Mar 05 2024

Innocent Iguana

0 Answers Avg Quality 2/10

pyspark rdd example

Comment

0

Tip Sanchit Rabadey 1 GREPCC

# Build an RDD by distributing a local list of (name, number) pairs
dataList = [
    ("Java", 20000),
    ("Python", 100000),
    ("Scala", 3000),
]
rdd = spark.sparkContext.parallelize(dataList)

xxxxxxxxxx

# Build an RDD by distributing a local list of (name, number) pairs
dataList = [
    ("Java", 20000),
    ("Python", 100000),
    ("Scala", 3000),
]
rdd = spark.sparkContext.parallelize(dataList)

Popularity 9/10 Helpfulness 3/10 Language python

Source: sparkbyexamples.com

Tags: pyspark

Share

Link to this answer
Share Copy Link

Contributed on Aug 16 2022

Sanchit Rabadey

0 Answers Avg Quality 2/10

pyspark rdd machine learning

Contents

More Related Answers