from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import functions as F
from pyspark.sql.window import Window
def perform_data_quality_checks(input_df):
    """
    Perform basic data quality checks on a PySpark DataFrame.

    For every column ``c`` of ``input_df`` the result gains:
      * ``c_is_unique``      -- True when the row's value for ``c`` occurs exactly once.
      * ``c_is_complete``    -- True when the value is not NULL.
      * ``c_data_coverage``  -- fraction of all rows sharing the row's value for ``c``.

    If the DataFrame has an ``age`` column, a ``constraint_checks`` struct column
    is added holding example domain constraints (currently: ``age > 0``).

    Args:
        input_df (pyspark.sql.DataFrame): Input DataFrame for data quality checks.

    Returns:
        pyspark.sql.DataFrame: DataFrame with additional columns indicating
        data quality test results.
    """
    output_df = input_df

    # Uniqueness check: a value is "unique" when its per-value window count is 1.
    # (No .alias() here -- withColumn supplies the output column name.)
    for column in input_df.columns:
        output_df = output_df.withColumn(
            f"{column}_is_unique",
            F.count(column).over(Window.partitionBy(column)) == 1,
        )

    # Completeness check: non-NULL values count as complete.
    for column in input_df.columns:
        output_df = output_df.withColumn(
            f"{column}_is_complete", F.col(column).isNotNull()
        )

    # Data coverage check. Hoist the row count out of the loop: count() is a
    # Spark action and the original code triggered a full job per column.
    total_records = input_df.count()
    for column in input_df.columns:
        # NOTE: Spark SQL division by a zero literal yields NULL (no crash),
        # so an empty DataFrame produces NULL coverage rather than an error.
        output_df = output_df.withColumn(
            f"{column}_data_coverage",
            F.count(column).over(Window.partitionBy(column)) / total_records,
        )

    # Constraint checks (example: a numerical column must be positive).
    # Only apply checks whose target column exists, so the function works on
    # arbitrary DataFrames instead of raising AnalysisException when "age"
    # is absent. Add more constraint checks as needed.
    constraint_checks = []
    if "age" in input_df.columns:
        constraint_checks.append((F.col("age") > 0).alias("age_positive"))
    if constraint_checks:
        output_df = output_df.withColumn(
            "constraint_checks", F.struct(*constraint_checks)
        )

    return output_df
if __name__ == "__main__":
    # Spin up a local Spark session for the demo.
    spark = SparkSession.builder.master("local").appName("DataQualityExample").getOrCreate()

    # Small sample data set: note the duplicate email and the NULL age,
    # which exercise the uniqueness and completeness checks respectively.
    sample_rows = [
        Row(user_id=1, email="john@example.com", age=25),
        Row(user_id=2, email="jane@example.com", age=None),
        Row(user_id=3, email="john@example.com", age=30),
        Row(user_id=4, email="alice@example.com", age=20),
    ]
    sample_schema = StructType([
        StructField("user_id", IntegerType(), True),
        StructField("email", StringType(), True),
        StructField("age", IntegerType(), True),
    ])
    example_df = spark.createDataFrame(sample_rows, sample_schema)

    # Show the raw input, then the input annotated with quality-check columns.
    print("Example DataFrame:")
    example_df.show()

    result_df = perform_data_quality_checks(example_df)

    print("Results of Data Quality Checks:")
    result_df.show(truncate=False)