import spacy
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Module-level pipeline: loaded once so repeated calls reuse the same model.
# NOTE(review): en_core_web_lg must be downloaded beforehand
# (`python -m spacy download en_core_web_lg`) or this import-time load raises.
nlp = spacy.load("en_core_web_lg")

# Maximum number of token vectors kept per text; shorter texts are
# zero-padded, longer ones truncated (both at the end, see padding/truncating).
MAX_LENGTH = 10


def preprocess_texts(texts):
    """Convert an iterable of raw strings into a padded array of word vectors.

    Each text is tokenized with spaCy; tokens without a vector (OOV) are
    skipped. The per-text vector sequences are then padded/truncated to
    MAX_LENGTH.

    Args:
        texts: iterable of str, the raw documents to embed.

    Returns:
        numpy.ndarray of dtype float32 with shape
        (len(texts), MAX_LENGTH, vector_dim) — vector_dim is whatever the
        loaded spaCy model provides.
    """
    sentence_vectors = []
    # nlp.pipe() streams documents through the pipeline in batches,
    # which is much faster than calling nlp() per text.
    for doc in nlp.pipe(texts, batch_size=1000):
        # Keep only tokens the model has an embedding for; a text made
        # entirely of OOV tokens yields an empty list and becomes all zeros
        # after padding.
        vectors = [token.vector for token in doc if token.has_vector]
        sentence_vectors.append(vectors)
    # Pad/truncate every sequence to a fixed length so the result is a
    # single dense array.
    return pad_sequences(
        sentence_vectors,
        maxlen=MAX_LENGTH,
        dtype='float32',
        padding='post',
        truncating='post',
    )


def wordEmbed(df, columns):
    """Replace the given text columns of *df* with word-embedding columns.

    For each column name ``col`` in *columns*, a new column
    ``"processed" + col`` is added holding that row's padded embedding array,
    then all original columns in *columns* are dropped.

    Args:
        df: pandas.DataFrame containing the text columns. Mutated in place.
        columns: iterable of column names (str) to embed and drop.

    Returns:
        The same DataFrame, with *columns* replaced by their
        ``processed<col>`` counterparts.
    """
    for col in columns:
        processed_array = preprocess_texts(df[col].tolist())
        # One (MAX_LENGTH, vector_dim) array per row; list() splits the
        # batch array along its first axis (previously done with an
        # index-loop comprehension).
        df["processed" + col] = list(processed_array)
    # Drop the raw text columns now that embeddings are stored.
    df.drop(columns=columns, inplace=True)
    return df