import spacy
from tensorflow.keras.preprocessing.sequence import pad_sequences

nlp = spacy.load("en_core_web_lg")

MAX_LENGTH = 10


def preprocess_texts(texts):
    """Convert each text into a padded sequence of spaCy token vectors."""
    sentence_vectors = []

    # Batch the texts through the pipeline; keep only tokens that have a
    # vector (out-of-vocabulary tokens are skipped).
    for doc in nlp.pipe(texts, batch_size=1000):
        vectors = [token.vector for token in doc if token.has_vector]
        sentence_vectors.append(vectors)

    # Pad shorter sequences with zero vectors and truncate longer ones, so
    # every text becomes a (MAX_LENGTH, vector_dim) array.
    return pad_sequences(sentence_vectors, maxlen=MAX_LENGTH, dtype='float32',
                         padding='post', truncating='post')
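
# A quick shape check (a hypothetical example added for illustration, not from
# the original source):
#   preprocess_texts(["spaCy makes word vectors easy to use"])
# returns a float32 array of shape (1, MAX_LENGTH, 300), because
# en_core_web_lg provides 300-dimensional token vectors.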


def wordEmbed(df, columns):
    """Replace each text column in `columns` with its padded embedding array."""
    for col in columns:
        processed_array = preprocess_texts(df[col].tolist())
        # Store one (MAX_LENGTH, vector_dim) array per row in a new column.
        df["processed" + col] = list(processed_array)

    # Drop the original text columns only after every column has been
    # processed; dropping inside the loop would break later iterations.
    df.drop(columns=columns, inplace=True)

    return df
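

# Minimal usage sketch (added for illustration; the DataFrame, its column
# names, and the sample sentences below are hypothetical, not from the
# original source).
if __name__ == "__main__":
    import pandas as pd

    demo = pd.DataFrame({
        "title": ["A short example sentence.", "Another title here."],
        "body": ["Some longer text to embed.", "A second row of body text."],
    })
    demo = wordEmbed(demo, ["title", "body"])

    # Each processed cell is a (MAX_LENGTH, 300) float32 array;
    # 300 is the vector width of en_core_web_lg.
    print(demo["processedtitle"].iloc[0].shape)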