# Raw records: (class label, numeric feature, gender, numeric feature).
data = [
    ('A', 0.7, 'm', 1.0),
    ('B', 0.1, 'f', 0.3),
    ('A', 0.8, 'm', 0.2),
    ('C', 0.2, 'f', 0.5),
    ('C', 0.5, 'f', 0.6),
]
rdd = sc.parallelize(data)

# Convert label to numerical values. The index is stored as a float because
# the metrics classes used below are fed (prediction, label) pairs of floats.
label_mapping = {'A': 0, 'B': 1, 'C': 2}
rdd = rdd.map(lambda x: (float(label_mapping[x[0]]),) + tuple(x[1:]))

# One-hot encoding (using Pipeline and pyspark dataframe construct).
# The raw columns are named explicitly so the pipeline stages can reference
# the gender column; the string value must never reach Vectors.dense.
df = spark.createDataFrame(rdd, ["label", "f1", "gender", "f2"])
pipeline = Pipeline(stages=[
    StringIndexer(inputCol='gender', outputCol='gender_index'),
    OneHotEncoder(inputCol='gender_index', outputCol='gender_encoded'),
])
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)

# Convert DataFrame back to an RDD of (label, features) pairs. The encoded
# gender is appended to the two numeric features so every model sees it.
# NOTE(review): assumes `Vectors` is pyspark.mllib.linalg.Vectors (it is used
# with LabeledPoint below) - confirm against the file's imports.
rdd = df.rdd.map(lambda row: (
    row.label,
    Vectors.dense([row.f1, row.f2] + row.gender_encoded.toArray().tolist()),
))

# Split the data into training and testing sets.
# NOTE(review): with only five rows the test split can come out empty or the
# training split can miss a class; use a larger dataset for real evaluation.
(trainingData, testData) = rdd.randomSplit([0.8, 0.2])

# Convert training data to LabeledPoint (exposes .label and .features), which
# is the input type the RDD-based trainers expect.
training_points = trainingData.map(lambda x: LabeledPoint(x[0], x[1]))

# Train the models
output_size = 3  # number of classes
model_lr = LogisticRegressionWithLBFGS.train(training_points, numClasses=output_size)
model_lin = LinearRegressionWithSGD.train(training_points, iterations=100, step=0.1)
model_rf = RandomForest.trainClassifier(
    training_points,
    numClasses=output_size,
    categoricalFeaturesInfo={},
    numTrees=10,
    featureSubsetStrategy="auto",
    impurity='gini',
    maxDepth=4,
    maxBins=32,
)
kmeans_model = KMeans.train(
    trainingData.map(lambda x: x[1]),
    k=3,
    maxIterations=10,
    initializationMode="random",
)

# Train the model with deep learning.
# MultilayerPerceptronClassifier is a DataFrame-based estimator: it is
# configured through its constructor and trained with fit(), and it needs
# pyspark.ml vectors, hence the asML() conversion.
input_size = len(trainingData.first()[1])
hidden_layers = [input_size, 5, output_size]  # input layer size, hidden layer sizes, output layer size
train_df = spark.createDataFrame(
    trainingData.map(lambda x: (x[0], x[1].asML())), ["label", "features"]
)
test_df = spark.createDataFrame(
    testData.map(lambda x: (x[0], x[1].asML())), ["label", "features"]
)
model_mlp = MultilayerPerceptronClassifier(
    layers=hidden_layers, maxIter=100, stepSize=0.1
).fit(train_df)

# Make predictions on the test data.
# predict() is called on the whole feature RDD and zipped with the labels
# instead of being called inside a map: the random forest model cannot be
# used inside an RDD transformation. Predictions are cast to float so the
# metrics classes accept them.
test_features = testData.map(lambda x: x[1])
test_labels = testData.map(lambda x: x[0])
predictions_lr = model_lr.predict(test_features).map(float).zip(test_labels)
predictions_lin = model_lin.predict(test_features).map(float).zip(test_labels)
predictions_rf = model_rf.predict(test_features).map(float).zip(test_labels)
predictions_mlp = (
    model_mlp.transform(test_df)
    .select("prediction", "label")
    .rdd.map(lambda row: (float(row.prediction), float(row.label)))
)
predictions_kmeans = kmeans_model.predict(test_features)

# Evaluate MLP, Random Forest model
metrics_mlp = MulticlassMetrics(predictions_mlp)
accuracy_mlp = metrics_mlp.accuracy
metrics_rf = MulticlassMetrics(predictions_rf)
accuracy_rf = metrics_rf.accuracy

# Evaluate Logistic Regression model.
# There are three classes, so multiclass accuracy is reported; binary
# metrics such as area under ROC do not apply here.
metrics_lr = MulticlassMetrics(predictions_lr)
accuracy_lr = metrics_lr.accuracy

# Evaluate Linear Regression model
metrics_lin = RegressionMetrics(predictions_lin)
rmse_lin = metrics_lin.rootMeanSquaredError
# Compute the root-mean-squared error (RMSE) of a set of predictions
def calculate_rmse(predictions):
    """Return the root-mean-squared error over (predicted, actual) pairs.

    Args:
        predictions: A collection exposing ``map`` and ``mean`` (the script
            passes an RDD). Each element is indexed as ``x[0]`` and ``x[1]``;
            the squared difference of the two is averaged.

    Returns:
        The square root of the mean squared difference, as computed by
        ``np.sqrt``.
    """
    squared_errors = predictions.map(lambda pair: (pair[0] - pair[1]) ** 2)
    return np.sqrt(squared_errors.mean())
# Calculate RMSE for KMeans model.
# The trained model (kmeans_model) does the predicting; predictions_kmeans
# holds the predictions it already produced and is not a model to call
# .predict on.
# NOTE(review): this compares a cluster index with a class label. Cluster
# numbering is arbitrary, so the value is only meaningful if the two happen
# to line up - confirm this is the intended metric.
rmse_kmeans = calculate_rmse(
    testData.map(lambda x: (kmeans_model.predict(x[1]), x[0]))
)
sc.stop()