import pandas as pd
import psycopg2
from sqlalchemy import create_engine
# The Extract phase: load the source data with pd.read_csv, pd.read_parquet, pd.read_json, etc.
# The Transform phase
def transform(sql_query, dbname='hr', user='postgres', password='postgres', port='5432', host='localhost'):
    """Run ``sql_query`` against a PostgreSQL database and return the result as a DataFrame.

    Args:
        sql_query: SQL text handed unchanged to ``pd.read_sql``. It is executed
            as-is, so it should not be assembled from untrusted input.
        dbname: Name of the database to connect to.
        user: Database user name.
        password: Password for ``user``.
        port: Port the server listens on.
        host: Host name or address of the server.

    Returns:
        The DataFrame produced by ``pd.read_sql`` for the query.

    Raises:
        Any exception raised by ``psycopg2.connect`` or ``pd.read_sql`` is
        propagated to the caller; the connection is closed first.
    """
    # Open a psycopg2 connection with the supplied settings.
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host, port=port)
    try:
        # Execute the query and load the results into a DataFrame.
        return pd.read_sql(sql_query, conn)
    finally:
        # Close the connection even when the query fails, so a bad query
        # does not leave a server session open.
        conn.close()
# The Load phase
def load(df, table_name, dbname='hr', user='postgres', password='postgres', port='5432', host='localhost'):
    """Append the rows of ``df`` to ``table_name`` in a PostgreSQL database.

    Args:
        df: DataFrame to persist; its index is not written.
        table_name: Target table. Rows are appended when the table exists
            (``if_exists='append'``; use ``'replace'`` in the call below to
            overwrite instead).
        dbname: Name of the database to connect to.
        user: Database user name (raw, not URL-encoded).
        password: Password for ``user`` (raw, not URL-encoded).
        port: Port the server listens on.
        host: Host name or address of the server.

    Raises:
        Any exception raised by ``create_engine`` or ``DataFrame.to_sql`` is
        propagated to the caller; the engine is disposed first.
    """
    from urllib.parse import quote

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # inside them are not mistaken for URL delimiters.
    safe_user = quote(str(user), safe='')
    safe_password = quote(str(password), safe='')

    # to_sql needs a SQLAlchemy engine rather than a raw psycopg2 connection.
    engine = create_engine(f'postgresql://{safe_user}:{safe_password}@{host}:{port}/{dbname}')
    try:
        # Persist the DataFrame to the SQL table.
        df.to_sql(table_name, engine, if_exists='append', index=False)
    finally:
        # Release the engine's pooled connections whether or not the write succeeded.
        engine.dispose()
    print(f"DataFrame persisted to table '{table_name}' successfully!")
# Transform step: read every row of the jobs table from PostgreSQL into a DataFrame
query_result = transform(sql_query='SELECT * FROM jobs')
# Load step: persist that DataFrame to final_table in PostgreSQL
load(df=query_result, table_name='final_table')