########### Splitting non-sequence data
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
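# Quick check (a minimal sketch, assuming `df` is an existing DataFrame):
# randomSplit assigns rows independently, so the 80/20 fractions are only approximate;
# counting the two splits shows the actual proportion.
n_train, n_test = train_df.count(), test_df.count()
print(n_train / (n_train + n_test))  # should be close to 0.8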
########### Splitting data with an inherent sequence (e.g. time series)
from pyspark.sql.functions import datediff, date_add, min as spark_min, max as spark_max
# Find the earliest and latest dates in the data, then how many days the data spans
bounds = df.agg(spark_min('DATE').alias('min_date'), spark_max('DATE').alias('max_date'))
range_in_days = bounds.select(datediff('max_date', 'min_date')).first()[0]  # number of days between the minimum and maximum date
# Find the date to split the dataset on
split_in_days = round(range_in_days * 0.8)  # 80% of the way through the date range
split_date = bounds.select(date_add('min_date', split_in_days)).first()[0]  # add the offset to the minimum date to get the split date
# Split the data into 80% train, 20% test
train_df = df.where(df['DATE'] < split_date)   # Keep only rows before the split date for training
test_df = df.where(df['DATE'] >= split_date)   # Keep rows on or after the split date for testing
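# Quick check (a sketch, assuming the split above): the training set should end
# strictly before the test set begins and hold roughly 80% of the rows
# (the exact share depends on how evenly rows are spread across dates).
print(train_df.agg({'DATE': 'max'}).first()[0], test_df.agg({'DATE': 'min'}).first()[0])
print(train_df.count() / df.count())  # roughly 0.8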