hongaik's picture
updated code
a27a834
raw
history blame
No virus
4.05 kB
import re
from gensim.models.keyedvectors import KeyedVectors
from transformers import pipeline
import pickle
import numpy as np
import pandas as pd
# ---------------------------------------------------------------------------
# Module-level resources, loaded once at import time.
# ---------------------------------------------------------------------------

# Word2vec embeddings used to turn words into vectors.
w2v = KeyedVectors.load('models/word2vec')

# Vocabulary as a set for O(1) membership tests when filtering OOV words.
# (Sorting before building a set has no effect, so it is not done.)
w2v_vocab = set(w2v.index_to_key)

# Topic classifier exposing predict / predict_proba.
# NOTE(security): pickle can execute arbitrary code while loading; only load
# model files that come from a trusted source.
with open('models/w2v_ovr_svc.sav', 'rb') as _model_file:
    model = pickle.load(_model_file)

# Zero-shot classifier used for positive/negative sentiment.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    framework='pt',
)

# Topic names; used as the column names / row labels for the outputs of
# `model`, so the order here is presumably the order the model was trained
# with -- confirm before reordering.
labels = [
    'communication', 'waiting time',
    'information', 'user interface',
    'facilities', 'location', 'price',
]

# sample.csv re-serialised (without the index) as UTF-8 encoded CSV bytes.
sample_file = pd.read_csv('sample.csv').to_csv(index=False).encode('utf-8')

print('utils imported!')
def get_sentiment_label_facebook(list_of_sent_dicts):
    """Reduce a zero-shot classification result to one sentiment label.

    Despite its name, ``list_of_sent_dicts`` is indexed like a mapping: it
    must have a ``'labels'`` entry holding a sequence of label strings
    (presumably ordered best-first by the classifier -- confirm).

    Returns ``'negative'`` when the first label is ``'negative'``; any other
    first label yields ``'positive'``.
    """
    first_label = list_of_sent_dicts['labels'][0]
    return 'negative' if first_label == 'negative' else 'positive'
def get_single_prediction(text):
    """Predict topic probabilities and sentiment for a single piece of text.

    The text is lower-cased, stripped of every character that is not an ASCII
    letter, digit or whitespace, and reduced to the words present in the
    word2vec vocabulary. The remaining words are averaged into one sentence
    vector for the topic model; the same cleaned text is passed to the
    zero-shot classifier for sentiment.

    Args:
        text: The raw input string.

    Returns:
        A tuple ``(pred_prob, sentiment_prob)`` of DataFrames:
        ``pred_prob`` has columns ``topic`` and ``probability`` (rounded to
        2 d.p., sorted ascending by probability); ``sentiment_prob`` has
        columns ``sentiment`` and ``probability`` taken from the classifier's
        ``labels`` and ``scores``.

    Raises:
        ValueError: If no word of ``text`` is in the word2vec vocabulary, so
            no sentence vector can be computed.
    """
    # Normalise: lower case, then drop special characters / punctuation.
    text = text.lower()
    text = re.sub(r'[^0-9a-zA-Z\s]', '', text)

    # Remove OOV words.
    tokens = [word for word in text.split() if word in w2v_vocab]
    if not tokens:
        # Averaging zero vectors yields NaN and a confusing reshape error
        # further down; fail early with an explicit message instead.
        raise ValueError(
            'text contains no words from the word2vec vocabulary'
        )
    text = ' '.join(tokens)

    # Sentence vector = average of word vectors.
    sentence_vector = np.mean([w2v[word] for word in tokens], axis=0)

    # Topic probabilities; reshape to a single-row 2-D array whatever the
    # embedding dimensionality is.
    results = (
        model.predict_proba(sentence_vector.reshape(1, -1)).squeeze().round(2)
    )
    pred_prob = pd.DataFrame(
        {'topic': labels, 'probability': results}
    ).sort_values('probability', ascending=True)

    # Sentiment via zero-shot classification on the cleaned text.
    sentiment_results = classifier(
        text,
        candidate_labels=['positive', 'negative'],
        hypothesis_template='The sentiment of this is {}',
    )
    sentiment_prob = pd.DataFrame({
        'sentiment': sentiment_results['labels'],
        'probability': sentiment_results['scores'],
    })

    return (pred_prob, sentiment_prob)
def get_multiple_predictions(csv):
    """Predict topics and sentiment for every row of a single-column CSV.

    Each row is cleaned the same way as a single prediction (lower-cased,
    stripped, non-alphanumeric characters removed, OOV words dropped). Rows
    that are missing or end up empty after cleaning are "invalid": they get
    no predictions and are appended, unchanged, after the valid rows.

    Args:
        csv: Anything accepted by ``pandas.read_csv``. The file must have
            exactly one column; it is renamed to ``sequence``.

    Returns:
        UTF-8 encoded CSV bytes with the columns ``sequence``, one column per
        entry of ``labels``, ``others`` (1 when no topic was predicted, else
        0) and ``sentiment``. Invalid rows have empty prediction cells.
    """
    df = pd.read_csv(csv)
    df.columns = ['sequence']

    # Clean the text. Missing values become '' so the string operations below
    # never see NaN; regex=True is explicit because newer pandas versions
    # treat the pattern literally by default.
    cleaned = (
        df['sequence']
        .fillna('')
        .astype(str)
        .str.lower()
        .str.strip()
        .str.replace(r'[^0-9a-zA-Z\s]', '', regex=True)
    )

    # Remove OOV words.
    df['sequence_clean'] = cleaned.apply(
        lambda s: ' '.join(word for word in s.split() if word in w2v_vocab)
    )

    # Split into rows we can score and rows we cannot (blank after cleaning).
    is_blank = df['sequence_clean'] == ''
    invalid = df.loc[is_blank, ['sequence']]
    valid = df.loc[~is_blank].reset_index(drop=True)

    if valid.empty:
        # Nothing to score: return the invalid rows with the usual columns
        # rather than feeding an empty frame to the model.
        output_columns = ['sequence', *labels, 'others', 'sentiment']
        return (
            invalid.reindex(columns=output_columns)
            .reset_index(drop=True)
            .to_csv(index=False)
            .encode('utf-8')
        )

    # Vectorise text. Sentence vector = average of word vectors.
    series_text_vectors = pd.DataFrame(
        valid['sequence_clean']
        .apply(lambda s: np.mean([w2v[word] for word in s.split()], axis=0))
        .values.tolist()
    )

    # Get predictions, one column per topic label.
    pred_results = pd.DataFrame(
        model.predict(series_text_vectors), columns=labels
    )

    # Join back to the original sequences (both frames share a 0..n-1 index).
    final_results = valid.join(pred_results)

    # 'others' flags rows for which no topic was predicted at all.
    final_results['others'] = (
        final_results[labels].max(axis=1) == 0
    ).astype(int)

    # Get sentiment labels.
    final_results['sentiment'] = final_results['sequence_clean'].apply(
        lambda s: get_sentiment_label_facebook(
            classifier(
                s,
                candidate_labels=['positive', 'negative'],
                hypothesis_template='The sentiment of this is {}',
            )
        )
    )

    final_results = final_results.drop(columns=['sequence_clean'])

    # Append invalid rows, if any, after the scored ones.
    if invalid.empty:
        return final_results.to_csv(index=False).encode('utf-8')
    return (
        pd.concat([final_results, invalid])
        .reset_index(drop=True)
        .to_csv(index=False)
        .encode('utf-8')
    )