# Create dataframe from RDD
spark_df = spark.createDataFrame(RDD, schema=colname_list)
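# Minimal, hedged sketch of the RDD -> DataFrame flow; the session name, sample
# data and column names below are illustrative, not part of the original notes.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("cheatsheet").getOrCreate()
rdd = spark.sparkContext.parallelize([(1, "a"), (2, "b")])
df_from_rdd = spark.createDataFrame(rdd, schema=["id", "letter"])
df_from_rdd.show()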
# Loading a file (passing a folder path makes Spark load every file in that folder in parallel)
from pyspark.sql.types import *
dataSchema = StructType([StructField('col1', StringType(), nullable=True),
                         StructField('col2', StringType(), nullable=False)])
df = spark.read.csv("file.csv", header=True, schema=dataSchema, comment='#', sep=',') # spark.read.json(), .text(), .parquet() / .load() work similarly
df = spark.read.format('csv').options(header=True).load('filename.csv') # schema can be enforced with .schema(dataSchema)
df.write.parquet('filename.parquet', mode='overwrite') # Save file (parquet is more efficient, binary format for big data)
df.write.format('parquet').save('filename.parquet')
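# Hedged end-to-end sketch of the read/write lines above; 'people.csv' and
# 'people.parquet' are made-up paths used only for illustration.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
people_schema = StructType([StructField('name', StringType(), nullable=True),
                            StructField('age', IntegerType(), nullable=True)])
people_df = spark.read.csv('people.csv', header=True, schema=people_schema)
people_df.write.parquet('people.parquet', mode='overwrite')
people_df = spark.read.parquet('people.parquet') # read the saved parquet back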
df.show(3) # Show first 3 rows
df.collect() # Return all rows to the driver as a list of Row objects
df.limit(3) # Returns a new DataFrame with only the first 3 rows (unlike show, nothing is printed)
df.dtypes # See datatype of each column
df.printSchema() # See schema information
result.columns # See result table columns
df.filter(~col('col').isNull()) # Keep only rows where 'col' is not null (col comes from pyspark.sql.functions)
df = df.na.drop(subset=["col_name"]) # Drop rows with nulls in the given column(s)
df = df.drop("col_name") # Drop a column
df = df.dropDuplicates() # Drop duplicates
df = df.withColumn("col_name", col("col_name").cast("float")) # Way 1 : Casting a column to another data type
df = df.withColumn("col_name", df.col_name.cast("float")) # Way 2 : Casting a column to another data type
from pyspark.sql.types import IntegerType # Remember: ArrayType is homogeneous, use StructType([]) for heterogeneity
df = df.withColumn('casted_col', df['col_name'].cast(IntegerType())) # Way 3 : Casting
df.describe().show() # Summary stats
df.agg({'col_name':'max'}).first()[0] # Maximum value of a column
df = df.repartition(4, 'some_col') # Hash-partition into 4 partitions so rows with the same 'some_col' value land in the same partition
print(df.rdd.getNumPartitions()) # See no of partitions of the dataset
df = df.coalesce(num_partitions) # Reduce the number of partitions without a full shuffle (less data movement between nodes)
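# Small illustration of repartition vs coalesce; 'parts_df' and the partition
# counts are arbitrary examples, not from the original notes.
parts_df = df.repartition(8, 'some_col') # full shuffle into 8 hash partitions on 'some_col'
print(parts_df.rdd.getNumPartitions())   # -> 8
parts_df = parts_df.coalesce(2)          # merges existing partitions, no full shuffle
print(parts_df.rdd.getNumPartitions())   # -> 2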
df_split = df.withColumn("DataSplit", split(df["Data"], ",")) # Split values and store as list in a column
df_explode = df_split.withColumn("ExplodedData", explode(df_split["DataSplit"])) # Store each element of list in separate rows
df_pivot = df_explode.groupBy("Key").pivot("ExplodedData").count().fillna(0) # pivot count of each element in separate columns
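# Worked example of the split/explode/pivot chain above; the sample data and
# imports are assumptions for illustration only.
from pyspark.sql.functions import split, explode
sample = spark.createDataFrame([("k1", "a,b,a"), ("k2", "b,c")], ["Key", "Data"])
sample = sample.withColumn("DataSplit", split(sample["Data"], ","))
sample = sample.withColumn("ExplodedData", explode(sample["DataSplit"]))
sample.groupBy("Key").pivot("ExplodedData").count().fillna(0).show()
# Expected shape (row order may vary): one column per element a, b, c,
# e.g. k1 -> a=2, b=1, c=0 and k2 -> a=0, b=1, c=1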
df = df.select(df.col1, df.col2, df.col3) # way1 : select column from dataframe
df = df.select("col1", "col2") # way2 : select column from dataframe
df.select(col('col1'), col('col2')) # way3 : select column from dataframe, import col from sql.functions
df = df.withColumn("new_col",df.old_col+10) # Add a new result column
df = df.withColumnRenamed("old_col_name", "new_col_name") # Rename column
df = df.select(col('col1').alias('col1_renamed'), 'col2')
df = df.selectExpr("col1", "col2", "col3", "col1/(col2/60) as another_col")
df = df.withColumn("idx", monotonically_increasing_id()) # Creating id column
df.where(array_contains('col', 'abc')) # Check if an element is inside an array
df1 = df1.withColumn("source", lit("df1")) # Add a constant (literal) column; lit comes from pyspark.sql.functions
df_vertical = df1.union(df2) # Append the rows of df2 below df1 (schemas must match)
df_horizontal = df1.join(df2, on=['common_col1', 'common_col2'], how="left") # Horizontal join (append columns by matching keys)
joined_df = df1.join(df2, df1["colx"] == df2["coly"] , how="inner") # Alternative way : Join
combined_df = df_1.join(broadcast(df_2)) # Broadcast join: ships a full copy of the small df_2 to every node, avoiding excess shuffling
df_cross = df1.crossJoin(df2) # Cross Join (Horizontally appending columns of possible combinations)
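# Hedged sketch of the join patterns above on tiny made-up DataFrames
# (left_df/right_df and their rows are illustrative names and values).
from pyspark.sql.functions import broadcast
left_df = spark.createDataFrame([(1, "x"), (2, "y")], ["id", "val1"])
right_df = spark.createDataFrame([(1, "p"), (3, "q")], ["id", "val2"])
left_df.join(right_df, on="id", how="left").show()              # id=2 gets null for val2
left_df.join(broadcast(right_df), on="id", how="inner").show()  # small right_df copied to every executor
left_df.crossJoin(right_df).show()                              # 2 x 2 = 4 combination rows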
# Filtering (all four lines below produce the same result)
df = df.filter("col_name > 120")
df = df.where("col_name > 120")
df = df.filter(df.col_name > 120)
df = df.where(df.col_name > 120)
filterA = df.col1 == "SEA"
filterB = df.col2 == "some_value" # a second boolean condition (illustrative)
result = df.filter(filterA).filter(filterB) # Chaining filters
df.groupBy("col_name").count() # Group by and count
df.orderBy("col_name") # order by
df.filter(df.col == 'value').groupBy().max("another_col") # Multiple chaining aggregation
df.createOrReplaceTempView("table_name") # Register DataFrame as a temporary table in the catalog
spark.catalog.listTables() # See all table information in the catalog
spark.catalog.dropTempView('table_name') # Remove temp table from catalog
spark_df = spark.table("table_name") # Load a catalog table as a Spark DataFrame
result = spark.sql("SELECT * FROM table_name") # Run query on table
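# Minimal round trip through the catalog; the view name 'people' and the sample
# rows are illustrative assumptions.
people = spark.createDataFrame([("Ana", 34), ("Bo", 28)], ["name", "age"])
people.createOrReplaceTempView("people")
print(spark.catalog.listTables())                         # the temp view appears here
spark.sql("SELECT name FROM people WHERE age > 30").show()
spark.catalog.dropTempView("people")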
from pyspark.sql.functions import udf # Using a PySpark custom function (UDF)
from pyspark.sql.types import IntegerType
def double_val(col):
    return col * 2 # Make sure the returned value matches the declared return type
double_val_udf = udf(double_val, IntegerType()) # Register the function as a UDF with its return type
df = df.withColumn("DoubledCol", double_val_udf(df["col"])) # Apply the UDF to a column
## PYSPARK VISUALIZATION
## NOTE
# Dense array: [1.0, 0.0, 0.0, 3.0]
# Sparse vector: (4, [0, 3], [1.0, 3.0]) -> (size, non-zero indices, non-zero values)
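# Illustration of the dense vs sparse forms noted above, using pyspark.ml.linalg;
# the variable names are illustrative.
from pyspark.ml.linalg import Vectors
dense = Vectors.dense([1.0, 0.0, 0.0, 3.0])
sparse = Vectors.sparse(4, [0, 3], [1.0, 3.0])
print(sparse.toArray()) # -> [1. 0. 0. 3.], the same values as the dense vector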