import pandas as pd

# speech_df is assumed to be a DataFrame with a raw 'text' column
# Remove non-letter characters
speech_df['text'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)
# Standardize case
speech_df['text'] = speech_df['text'].str.lower()
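# A quick, hedged illustration of the two cleaning steps on a made-up string
# (sample is an example, not part of the dataset):
sample = pd.Series(['Hello, World! 2024'])
print(sample.str.replace('[^a-zA-Z]', ' ', regex=True).str.lower())  # Digits/punctuation become spaces, case is lowered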
# Generate Feature : Average word length
speech_df['char_cnt'] = speech_df['text'].str.len()
speech_df['word_cnt'] = speech_df['text'].str.split().apply(len)
speech_df['avg_word_len'] = speech_df['char_cnt'] / speech_df['word_cnt']
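# Note: rows whose text became empty after cleaning have word_cnt == 0, which
# makes avg_word_len inf or NaN. A minimal guard, assuming the columns above:
import numpy as np
speech_df['avg_word_len'] = speech_df['char_cnt'] / speech_df['word_cnt'].replace(0, np.nan)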
# Generate Feature : tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')  # Uncomment on first run to fetch the stopword list
stop_words = stopwords.words('english')  # sklearn expects a list of stop words
vec = TfidfVectorizer(max_df=0.9, min_df=0.1, max_features=100, stop_words=stop_words)
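# A usage sketch for the tf-idf vectorizer above (names follow the code above):
tfidf_matrix = vec.fit_transform(speech_df['text'])
print(tfidf_matrix.shape)  # (n_documents, n_terms_kept)
print(vec.get_feature_names_out()[:10])  # Terms that survived the max_df / min_df / max_features filters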
# Generate Feature : Bag of words / Word Count Vector
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(max_features=100, stop_words='english', min_df=0.1, max_df=0.9)
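# Same fit/transform pattern applies here; a quick sketch of inspecting the raw counts:
bow = vec.fit_transform(speech_df['text'])
print(bow.toarray()[0][:10])  # Word counts for the first document
print(vec.get_feature_names_out()[:10])  # The vocabulary those counts refer to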
# Generate Feature : Introduce context with n-grams
vec = TfidfVectorizer(max_df=0.9, min_df=0.1, max_features=100, stop_words=stop_words, ngram_range=(2, 2))  # Capture context from pairs of consecutive words
vec.fit(speech_df['text'])
transformed = vec.transform(speech_df['text'])
vec_df = pd.DataFrame(transformed.toarray(), columns=vec.get_feature_names_out()).add_prefix('TFIDF_')
# Sanity check : Find common words / patterns
vec_df.iloc[0].sort_values(ascending=False).head()  # Highest-weighted terms in the first document
vec_df.sum().sort_values(ascending=False).head()  # Terms with the highest total weight across the corpus
speech_df = pd.concat([speech_df, vec_df], axis=1, sort=False)
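# Caveat worth noting: when the data is split, fit the vectorizer on the
# training texts only, then transform both splits with the fitted vocabulary.
# A sketch assuming hypothetical train_df / test_df splits (not defined above):
vec.fit(train_df['text'])
X_train = vec.transform(train_df['text'])
X_test = vec.transform(test_df['text'])  # Reuses the vocabulary learned from train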
# Generate Feature : One-hot encoded words
from keras.utils import to_categorical
import re
text = 'A sample sentence, for illustration purposes!'  # Hypothetical input string (not defined in the original)
text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (keep word characters and whitespace)
words = text.split() # Split text into words
unique_words = list(set(words)) # Get unique words
word_to_index = {word: i for i, word in enumerate(unique_words)} # Create dictionary with word as key and index as value
numeric_text = [word_to_index[word] for word in words] # Map words to numeric representation
one_hot_encoded = to_categorical(numeric_text, num_classes=len(unique_words)) # One-hot encode using keras to_categorical
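# Quick sanity check on the one-hot matrix: one row per token, one column per unique word
print(one_hot_encoded.shape)  # (number of words in text, number of unique words)
print(one_hot_encoded[0].argmax() == word_to_index[words[0]])  # True: row 0 encodes the first word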