# Dataset/preprocessing/getNLP.py
import spacy
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Large English pipeline with 300-dimensional word vectors
nlp = spacy.load("en_core_web_lg")
MAX_LENGTH = 10  # maximum number of tokens kept per text
def preprocess_texts(texts):
    sentence_vectors = []
    # Use nlp.pipe() to process texts in batches (much faster than one-by-one)
    for doc in nlp.pipe(texts, batch_size=1000):
        # Keep the word vector of every token that has one
        vectors = [token.vector for token in doc if token.has_vector]
        sentence_vectors.append(vectors)
    # Pad/truncate every sequence of vectors to exactly MAX_LENGTH tokens
    return pad_sequences(sentence_vectors, maxlen=MAX_LENGTH, dtype='float32',
                         padding='post', truncating='post')
def wordEmbed(df, columns):
    for col in columns:
        # Embed the column's texts into (MAX_LENGTH, 300) float32 arrays
        processed_array = preprocess_texts(df[col].tolist())
        df["processed" + col] = list(processed_array)
    # Drop the original raw-text columns once they are embedded
    df.drop(columns=columns, inplace=True)
    return df
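
# Minimal usage sketch, not part of the original file: assumes pandas is
# available and uses a hypothetical DataFrame with a "review" column (the
# column name is illustrative only).
if __name__ == "__main__":
    import pandas as pd

    sample = pd.DataFrame({"review": ["great product", "arrived late and broken"]})
    embedded = wordEmbed(sample, ["review"])
    # en_core_web_lg vectors are 300-dimensional, so each entry is (10, 300)
    print(embedded["processedreview"].iloc[0].shape)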