# Binary Encoding
df["cat_col"] = df["cat_col"].apply(lambda val: 1 if val == "y" else 0)
# One-hot encoding of a categorical variable with pandas get_dummies
df_onehot = pd.get_dummies(df, columns=['cat'], prefix='C')
df_dummy = pd.get_dummies(df, columns=['cat'], drop_first=True, prefix='C')  # drop_first=True drops one dummy level to avoid redundancy
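# Quick illustration on a toy 'cat' column (category names are placeholders):
demo = pd.DataFrame({'cat': ['a', 'b', 'c', 'a']})
print(pd.get_dummies(demo, columns=['cat'], prefix='C'))  # Columns C_a, C_b, C_c
# With drop_first=True only C_b and C_c remain, since C_a is implied by the others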
# Alternative approach 2: scikit-learn OneHotEncoder
from sklearn import preprocessing
encoder = preprocessing.OneHotEncoder()
onehot_transformed = encoder.fit_transform(df['cat_col'].values.reshape(-1,1))
# Convert the sparse result into a DataFrame (columns default to 0..n-1)
onehot_df = pd.DataFrame(onehot_transformed.toarray())
# Join the encoded columns onto the original DataFrame
df = pd.concat([df, onehot_df], axis=1)
# Drop the original column that you used for encoding
df = df.drop('cat_col', axis=1)
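# Self-contained sketch of the same flow with readable column names instead of 0..n-1
# (toy data; get_feature_names_out requires scikit-learn >= 1.0):
demo = pd.DataFrame({'cat_col': ['low', 'high', 'low']})
demo_enc = preprocessing.OneHotEncoder()
demo_arr = demo_enc.fit_transform(demo[['cat_col']]).toarray()
print(pd.DataFrame(demo_arr, columns=demo_enc.get_feature_names_out(['cat_col'])))
# Columns: cat_col_high, cat_col_low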
# Label encoding: turning string labels into numeric codes
from sklearn import preprocessing
encoder_lvl = preprocessing.LabelEncoder()
# Fit the encoder on the unique categories that appear in the column
encoder_lvl.fit(['LOW', 'NORMAL', 'HIGH'])
# Apply label encoding to the third column of the dataset
df.iloc[:, 2] = encoder_lvl.transform(df.iloc[:, 2])
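# Sketch of how the fitted encoder maps labels: classes_ are stored in sorted
# order, and inverse_transform recovers the original strings:
print(list(encoder_lvl.classes_))              # ['HIGH', 'LOW', 'NORMAL']
print(encoder_lvl.transform(['LOW', 'HIGH']))  # [1 0]
print(encoder_lvl.inverse_transform([2]))      # ['NORMAL']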
# Alternative approach : DictVectorizer
from sklearn.feature_extraction import DictVectorizer
df_dict = df.to_dict("records")  # Convert df into a list of dictionaries (one per row)
dv = DictVectorizer(sparse=False)
df_encoded = dv.fit_transform(df_dict)
print(df_encoded[:5,:]) # Print first five rows
# Print the vocabulary (how features are mapped to columns in the resulting matrix)
print(dv.vocabulary_)
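# Tiny self-contained DictVectorizer example on toy records (not the real df):
# string values become one indicator column per category, numeric values pass through.
demo_records = [{'cat': 'a', 'num': 1.0}, {'cat': 'b', 'num': 2.0}]
dv_demo = DictVectorizer(sparse=False)
print(dv_demo.fit_transform(demo_records))  # [[1. 0. 1.] [0. 1. 2.]]
print(dv_demo.vocabulary_)                  # feature names 'cat=a', 'cat=b', 'num' -> columns 0, 1, 2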
# Alternative approach: use pandas only
# Turn the response variable into integer category codes
df['cat_col'] = pd.Categorical(df['cat_col'])
df['cat_col'] = df['cat_col'].cat.codes
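# Sketch of what the two lines above do, on a toy column (values illustrative):
demo_cat = pd.Categorical(['LOW', 'HIGH', 'LOW', 'NORMAL'])
print(list(demo_cat.categories))  # ['HIGH', 'LOW', 'NORMAL'] -- codes 0, 1, 2 in this order
print(list(demo_cat.codes))       # [1, 0, 1, 2]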
from tensorflow.keras.utils import to_categorical  # One-hot encoding for a categorical target variable
y = to_categorical(data['target'])  # One-hot encode the integer class labels for a classification problem
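# Sketch: to_categorical expects integer class ids and returns one one-hot row per
# sample; np.argmax along axis 1 reverses the encoding (toy labels below):
import numpy as np
demo_y = to_categorical([0, 2, 1], num_classes=3)
print(demo_y)                     # [[1. 0. 0.] [0. 0. 1.] [0. 1. 0.]]
print(np.argmax(demo_y, axis=1))  # [0 2 1]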
# Given one-hot encoded arrays of predictions, how do we calculate the percentage of correct predictions?
number_correct = (test_labels * predictions).sum()  # Each row where the one-hot prediction matches the label contributes 1
proportion_correct = number_correct / test_labels.shape[0] # Calculate the proportion of correct predictions
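# Toy check of the accuracy formula above (arrays are illustrative one-hot labels):
import numpy as np
demo_labels = np.array([[1, 0], [0, 1], [1, 0]])
demo_preds = np.array([[1, 0], [1, 0], [1, 0]])  # second prediction is wrong
print((demo_labels * demo_preds).sum() / demo_labels.shape[0])  # 0.666...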
# One-hot encoding the words of a text with to_categorical ('text' is assumed to be a raw string defined earlier)
from tensorflow.keras.utils import to_categorical
import re
text = re.sub(r'[^\w\s]', '', text)  # Strip punctuation, keeping word characters and whitespace
words = text.split() # Split text into words
unique_words = list(set(words)) # Get unique words
word_to_index = {word: i for i, word in enumerate(unique_words)} # Create dictionary with word as key and index as value
numeric_text = [word_to_index[word] for word in words] # Map words to numeric representation
one_hot_encoded = to_categorical(numeric_text, num_classes=len(unique_words)) # One-hot encode using keras to_categorical
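# Sketch: decode the one-hot rows back to words via argmax and an inverse map
# (index_to_word is a helper introduced here for illustration):
import numpy as np
index_to_word = {i: w for w, i in word_to_index.items()}
decoded = [index_to_word[i] for i in np.argmax(one_hot_encoded, axis=1)]
print(decoded == words)  # True -- the round trip recovers the original word sequence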
### Alternative approach: one-hot encode the words directly with pandas
# to_categorical only accepts integer class ids, so for raw strings use pd.get_dummies instead
onehot_2 = pd.get_dummies(words)  # One column per unique word, one row per word occurrence
print([(w, row.tolist()) for w, row in zip(words, onehot_2.values)])