import pandas as pd

# speech_df is assumed to be a DataFrame with a raw 'text' column
# Remove non-letter characters
speech_df['text'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)
# Standardize case
speech_df['text'] = speech_df['text'].str.lower()
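# A quick, hedged illustration of the two cleaning steps on a made-up string
# (sample is an example, not part of the dataset):
sample = pd.Series(['Hello, World! 2024'])
print(sample.str.replace('[^a-zA-Z]', ' ', regex=True).str.lower())  # Digits/punctuation become spaces, case is lowered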
# Generate Feature : Average word length
speech_df['char_cnt'] = speech_df['text'].str.len()
speech_df['word_cnt'] = speech_df['text'].str.split().apply(len)
speech_df['avg_word_len'] = speech_df['char_cnt'] / speech_df['word_cnt']
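# Note: rows whose text became empty after cleaning have word_cnt == 0, which
# makes avg_word_len inf or NaN. A minimal guard, assuming the columns above:
import numpy as np
speech_df['avg_word_len'] = speech_df['char_cnt'] / speech_df['word_cnt'].replace(0, np.nan)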
# Generate Feature : tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')  # Uncomment on first run to fetch the stopword list
stop_words = stopwords.words('english')  # sklearn expects a list of stop words
vec = TfidfVectorizer(max_df=0.9, min_df=0.1, max_features=100, stop_words=stop_words)
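# A usage sketch for the tf-idf vectorizer above (names follow the code above):
tfidf_matrix = vec.fit_transform(speech_df['text'])
print(tfidf_matrix.shape)  # (n_documents, n_terms_kept)
print(vec.get_feature_names_out()[:10])  # Terms that survived the max_df / min_df / max_features filters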
# Generate Feature : Bag of words / Word Count Vector
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(max_features=100, stop_words='english', min_df=0.1, max_df=0.9)
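# Same fit/transform pattern applies here; a quick sketch of inspecting the raw counts:
bow = vec.fit_transform(speech_df['text'])
print(bow.toarray()[0][:10])  # Word counts for the first document
print(vec.get_feature_names_out()[:10])  # The vocabulary those counts refer to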
# Generate Feature : Introduce context with n-grams
vec = TfidfVectorizer(max_df=0.9, min_df=0.1, max_features=100, stop_words=stop_words, ngram_range=(2, 2))  # Capture context from pairs of consecutive words
vec.fit(speech_df['text'])
transformed = vec.transform(speech_df['text'])
vec_df = pd.DataFrame(transformed.toarray(), columns=vec.get_feature_names_out()).add_prefix('TFIDF_')
# Sanity check : Find common words / patterns
vec_df.iloc[0].sort_values(ascending=False).head()  # Highest-weighted terms in the first document
vec_df.sum().sort_values(ascending=False).head()  # Terms with the highest total weight across the corpus
speech_df = pd.concat([speech_df, vec_df], axis=1, sort=False)
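# Caveat worth noting: when the data is split, fit the vectorizer on the
# training texts only, then transform both splits with the fitted vocabulary.
# A sketch assuming hypothetical train_df / test_df splits (not defined above):
vec.fit(train_df['text'])
X_train = vec.transform(train_df['text'])
X_test = vec.transform(test_df['text'])  # Reuses the vocabulary learned from train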
# Generate Feature : One-hot encoded words
from keras.utils import to_categorical
import re
text = 'A sample sentence, for illustration purposes!'  # Hypothetical input string (not defined in the original)
text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (keep word characters and whitespace)
words = text.split() # Split text into words
unique_words = list(set(words)) # Get unique words
word_to_index = {word: i for i, word in enumerate(unique_words)} # Create dictionary with word as key and index as value
numeric_text = [word_to_index[word] for word in words] # Map words to numeric representation
one_hot_encoded = to_categorical(numeric_text, num_classes=len(unique_words)) # One-hot encode using keras to_categorical
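# Quick sanity check on the one-hot matrix: one row per token, one column per unique word
print(one_hot_encoded.shape)  # (number of words in text, number of unique words)
print(one_hot_encoded[0].argmax() == word_to_index[words[0]])  # True: row 0 encodes the first word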