# Create dataframe from RDD
spark_df = spark.createDataFrame(RDD, schema=colname_list)
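# Minimal, hedged sketch of the RDD -> DataFrame flow; the session name, sample
# data and column names below are illustrative, not part of the original notes.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("cheatsheet").getOrCreate()
rdd = spark.sparkContext.parallelize([(1, "a"), (2, "b")])
df_from_rdd = spark.createDataFrame(rdd, schema=["id", "letter"])
df_from_rdd.show()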
# Loading a file (passing a folder path makes Spark load every file in that folder in parallel)
from pyspark.sql.types import *
dataSchema = StructType([StructField('col1', StringType(), nullable=True),
                         StructField('col2', StringType(), nullable=False)])
df = spark.read.csv("file.csv", header=True, schema=dataSchema, comment='#', sep=',') # spark.read.json(), .text(), .parquet() / .load() work similarly
df = spark.read.format('csv').options(header=True).load('filename.csv') # schema can be enforced with .schema(dataSchema)
df.write.parquet('filename.parquet', mode='overwrite') # Save file (parquet is more efficient, binary format for big data)
df.write.format('parquet').save('filename.parquet')
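# Hedged end-to-end sketch of the read/write lines above; 'people.csv' and
# 'people.parquet' are made-up paths used only for illustration.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
people_schema = StructType([StructField('name', StringType(), nullable=True),
                            StructField('age', IntegerType(), nullable=True)])
people_df = spark.read.csv('people.csv', header=True, schema=people_schema)
people_df.write.parquet('people.parquet', mode='overwrite')
people_df = spark.read.parquet('people.parquet') # read the saved parquet back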
df.show(3) # Show first 3 rows
df.collect() # Return all rows to the driver as a list of Row objects
df.limit(3) # Returns a new DataFrame with only the first 3 rows (unlike show, nothing is printed)
df.dtypes # See datatype of each column
df.printSchema() # See schema information
result.columns # See result table columns
df.filter(~col('col').isNull()) # Keep only rows where 'col' is not null (col comes from pyspark.sql.functions)
df = df.na.drop(subset=["col_name"]) # Drop rows with nulls in the given column(s)
df = df.drop("col_name") # Drop a column
df = df.dropDuplicates() # Drop duplicates
df = df.withColumn("col_name", col("col_name").cast("float")) # Way 1 : Casting a column to another data type
df = df.withColumn("col_name", df.col_name.cast("float")) # Way 2 : Casting a column to another data type
from pyspark.sql.types import IntegerType # Remember: ArrayType is homogeneous, use StructType([]) for heterogeneity
df = df.withColumn('casted_col', df['col_name'].cast(IntegerType())) # Way 3 : Casting
df.describe().show() # Summary stats
df.agg({'col_name':'max'}).first()[0] # Maximum value of a column
df = df.repartition(4, 'some_col') # Hash-partition into 4 partitions so rows with the same 'some_col' value land in the same partition
print(df.rdd.getNumPartitions()) # See no of partitions of the dataset
df = df.coalesce(num_partitions) # Reduce the number of partitions without a full shuffle (less data movement between nodes)
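# Small illustration of repartition vs coalesce; 'parts_df' and the partition
# counts are arbitrary examples, not from the original notes.
parts_df = df.repartition(8, 'some_col') # full shuffle into 8 hash partitions on 'some_col'
print(parts_df.rdd.getNumPartitions())   # -> 8
parts_df = parts_df.coalesce(2)          # merges existing partitions, no full shuffle
print(parts_df.rdd.getNumPartitions())   # -> 2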
df_split = df.withColumn("DataSplit", split(df["Data"], ",")) # Split values and store as list in a column
df_explode = df_split.withColumn("ExplodedData", explode(df_split["DataSplit"])) # Store each element of list in separate rows
df_pivot = df_explode.groupBy("Key").pivot("ExplodedData").count().fillna(0) # pivot count of each element in separate columns
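# Worked example of the split/explode/pivot chain above; the sample data and
# imports are assumptions for illustration only.
from pyspark.sql.functions import split, explode
sample = spark.createDataFrame([("k1", "a,b,a"), ("k2", "b,c")], ["Key", "Data"])
sample = sample.withColumn("DataSplit", split(sample["Data"], ","))
sample = sample.withColumn("ExplodedData", explode(sample["DataSplit"]))
sample.groupBy("Key").pivot("ExplodedData").count().fillna(0).show()
# Expected shape (row order may vary): one column per element a, b, c,
# e.g. k1 -> a=2, b=1, c=0 and k2 -> a=0, b=1, c=1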
df = df.select(df.col1, df.col2, df.col3) # way1 : select column from dataframe
df = df.select("col1", "col2") # way2 : select column from dataframe
df.select(col('col1'), col('col2')) # way3 : select column from dataframe, import col from sql.functions
df = df.withColumn("new_col",df.old_col+10) # Add a new result column
df = df.withColumnRenamed("old_col_name", "new_col_name") # Rename column
df = df.select(col('col1').alias('col1_renamed'), 'col2')
df = df.selectExpr("col1", "col2", "col3", "col1/(col2/60) as another_col")
df = df.withColumn("idx", monotonically_increasing_id()) # Creating id column
df.where(array_contains('col', 'abc')) # Check if an element is inside an array
df1 = df1.withColumn("source", lit("df1")) # Add a constant (literal) column; lit comes from pyspark.sql.functions
df_vertical = df1.union(df2) # Append the rows of df2 below df1 (schemas must match)
df_horizontal = df1.join(df2, on=['common_col1', 'common_col2'], how="left") # Horizontal join (append columns by matching keys)
joined_df = df1.join(df2, df1["colx"] == df2["coly"] , how="inner") # Alternative way : Join
combined_df = df_1.join(broadcast(df_2)) # Broadcast join: ships a full copy of the small df_2 to every node, avoiding excess shuffling
df_cross = df1.crossJoin(df2) # Cross Join (Horizontally appending columns of possible combinations)
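# Hedged sketch of the join patterns above on tiny made-up DataFrames
# (left_df/right_df and their rows are illustrative names and values).
from pyspark.sql.functions import broadcast
left_df = spark.createDataFrame([(1, "x"), (2, "y")], ["id", "val1"])
right_df = spark.createDataFrame([(1, "p"), (3, "q")], ["id", "val2"])
left_df.join(right_df, on="id", how="left").show()              # id=2 gets null for val2
left_df.join(broadcast(right_df), on="id", how="inner").show()  # small right_df copied to every executor
left_df.crossJoin(right_df).show()                              # 2 x 2 = 4 combination rows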
# Filtering (all four lines below produce the same result)
df = df.filter("col_name > 120")
df = df.where("col_name > 120")
df = df.filter(df.col_name > 120)
df = df.where(df.col_name > 120)
filterA = df.col1 == "SEA"
filterB = df.col2 == "some_value" # a second boolean condition (illustrative)
result = df.filter(filterA).filter(filterB) # Chaining filters
df.groupBy("col_name").count() # Group by and count
df.orderBy("col_name") # order by
df.filter(df.col == 'value').groupBy().max("another_col") # Multiple chaining aggregation
df.createOrReplaceTempView("table_name") # Register DataFrame as a temporary table in the catalog
spark.catalog.listTables() # See all table information in the catalog
spark.catalog.dropTempView('table_name') # Remove temp table from catalog
spark_df = spark.table("table_name") # Load a catalog table as a Spark DataFrame
result = spark.sql("SELECT * FROM table_name") # Run query on table
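# Minimal round trip through the catalog; the view name 'people' and the sample
# rows are illustrative assumptions.
people = spark.createDataFrame([("Ana", 34), ("Bo", 28)], ["name", "age"])
people.createOrReplaceTempView("people")
print(spark.catalog.listTables())                         # the temp view appears here
spark.sql("SELECT name FROM people WHERE age > 30").show()
spark.catalog.dropTempView("people")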
from pyspark.sql.functions import udf # Using a PySpark custom function (UDF)
from pyspark.sql.types import IntegerType
def double_val(col):
    return col * 2 # Make sure the returned value matches the declared return type
double_val_udf = udf(double_val, IntegerType()) # Register the function as a UDF with its return type
df = df.withColumn("DoubledCol", double_val_udf(df["col"])) # Apply the UDF to a column
## PYSPARK VISUALIZATION
## NOTE
# Dense array: [1.0, 0.0, 0.0, 3.0]
# Sparse vector: (4, [0, 3], [1.0, 3.0]) -> (size, non-zero indices, non-zero values)
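# Illustration of the dense vs sparse forms noted above, using pyspark.ml.linalg;
# the variable names are illustrative.
from pyspark.ml.linalg import Vectors
dense = Vectors.dense([1.0, 0.0, 0.0, 3.0])
sparse = Vectors.sparse(4, [0, 3], [1.0, 3.0])
print(sparse.toArray()) # -> [1. 0. 0. 3.], the same values as the dense vector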