Spaces:

hongaik
/

service_text_classification

Runtime error

App Files Files Community

service_text_classification / utils.py

hongaik

updated code

a27a834 about 2 years ago

raw history blame contribute delete

No virus

4.05 kB

	import re
	from gensim.models.keyedvectors import KeyedVectors
	from transformers import pipeline
	import pickle
	import numpy as np
	import pandas as pd

	# Word2vec embeddings loaded from disk. The vocabulary is held as a set so
	# the out-of-vocabulary filters in the prediction functions get O(1)
	# membership tests (sorting before building a set has no effect).
	w2v = KeyedVectors.load('models/word2vec')
	w2v_vocab = set(w2v.index_to_key)

	# Pickled topic model, used via predict / predict_proba.
	# NOTE(review): the file name suggests a one-vs-rest SVC on word2vec
	# features -- confirm against the training code.
	# NOTE(security): unpickling can execute arbitrary code; only load model
	# files that ship with the app, never user-supplied ones.
	with open('models/w2v_ovr_svc.sav', 'rb') as model_file:
	    model = pickle.load(model_file)

	# Zero-shot classification pipeline (PyTorch backend).
	classifier = pipeline(
	    "zero-shot-classification",
	    model="facebook/bart-large-mnli",
	    framework='pt',
	)

	# Topic names used to label the topic model's output columns.
	# NOTE(review): order presumably matches the model's training label order.
	labels = [
	    'communication', 'waiting time',
	    'information', 'user interface',
	    'facilities', 'location', 'price',
	]

	# sample.csv re-serialised (without the index) as UTF-8 encoded CSV bytes.
	sample_file = pd.read_csv('sample.csv').to_csv(index=False).encode('utf-8')

	print('utils imported!')

	def get_sentiment_label_facebook(list_of_sent_dicts):
	    """Reduce a sentiment classification result to 'negative' or 'positive'.

	    Despite the parameter name, ``list_of_sent_dicts`` is indexed as a single
	    mapping: its ``'labels'`` entry is a sequence, and only the first element
	    is inspected. Anything other than ``'negative'`` in that position is
	    reported as ``'positive'``.
	    """
	    top_label = list_of_sent_dicts['labels'][0]
	    return 'negative' if top_label == 'negative' else 'positive'

	def get_single_prediction(text):
	    """Predict topic probabilities and sentiment for one piece of text.

	    Args:
	        text: Raw input string.

	    Returns:
	        A ``(pred_prob, sentiment_prob)`` tuple of DataFrames.
	        ``pred_prob`` has columns ``'topic'`` and ``'probability'`` (rounded
	        to 2 decimals, sorted by ascending probability). ``sentiment_prob``
	        has columns ``'sentiment'`` and ``'probability'`` taken from the
	        zero-shot classifier's ``'labels'`` and ``'scores'``.

	    Raises:
	        ValueError: If, after cleaning, no word of ``text`` is in the
	            word2vec vocabulary, so no sentence vector can be built.
	    """
	    # Normalise: lower-case, then keep only ASCII letters, digits, whitespace.
	    text = text.lower()
	    text = re.sub(r'[^0-9a-zA-Z\s]', '', text)

	    # Remove OOV words
	    tokens = [word for word in text.split() if word in w2v_vocab]
	    if not tokens:
	        # Averaging zero vectors would give NaN and fail later with an
	        # opaque reshape error; fail early with a clear message instead.
	        raise ValueError('text contains no words in the word2vec vocabulary')
	    text = ' '.join(tokens)

	    # Sentence vector = average of word vectors
	    text_vectors = np.mean([w2v[word] for word in tokens], axis=0)

	    # Make predictions; reshape to a single-row matrix whatever the
	    # embedding dimension is.
	    results = model.predict_proba(text_vectors.reshape(1, -1)).squeeze().round(2)
	    pred_prob = pd.DataFrame(
	        {'topic': labels, 'probability': results}
	    ).sort_values('probability', ascending=True)

	    # Get sentiment (scored on the cleaned, OOV-filtered text)
	    sentiment_results = classifier(
	        text,
	        candidate_labels=['positive', 'negative'],
	        hypothesis_template='The sentiment of this is {}',
	    )
	    sentiment_prob = pd.DataFrame({
	        'sentiment': sentiment_results['labels'],
	        'probability': sentiment_results['scores'],
	    })

	    return (pred_prob, sentiment_prob)

	def get_multiple_predictions(csv):
	    """Predict topics and sentiment for every row of a one-column CSV.

	    Args:
	        csv: Path or file-like object accepted by ``pd.read_csv``. The file
	            must have exactly one column; it is renamed to ``'sequence'``.

	    Returns:
	        UTF-8 encoded CSV bytes with columns ``'sequence'``, one column per
	        entry in ``labels``, ``'others'`` (1 when no topic was predicted,
	        else 0) and ``'sentiment'``. Rows that are empty or contain no
	        in-vocabulary words are appended at the end with only
	        ``'sequence'`` filled in.
	    """
	    df = pd.read_csv(csv)
	    df.columns = ['sequence']

	    raw = df['sequence']
	    # Lower-case, trim, and strip everything except ASCII letters, digits and
	    # whitespace. regex=True is required: pandas >= 2.0 treats the pattern as
	    # a literal string by default, which would remove nothing.
	    cleaned = raw.astype(str).str.lower().str.strip()
	    cleaned = cleaned.str.replace(r'[^0-9a-zA-Z\s]', '', regex=True)
	    # Missing cells become '' so they are reported as invalid rows instead of
	    # crashing the tokeniser (or being read as the word 'nan').
	    cleaned = cleaned.mask(raw.isna(), '')

	    # Remove OOV words
	    df['sequence_clean'] = cleaned.apply(
	        lambda x: ' '.join(word for word in x.split() if word in w2v_vocab)
	    )

	    # Split off rows with nothing left to classify
	    is_blank = df['sequence_clean'] == ''
	    invalid = df.loc[is_blank].drop(columns=['sequence_clean'])
	    df = df.loc[~is_blank].reset_index(drop=True)

	    output_columns = ['sequence', *labels, 'others', 'sentiment']
	    if df.empty:
	        # Nothing to feed the models; return the invalid rows under the
	        # usual output header.
	        return (
	            invalid.reindex(columns=output_columns)
	            .reset_index(drop=True)
	            .to_csv(index=False)
	            .encode('utf-8')
	        )

	    # Vectorise text. Sentence vector = average of word vectors
	    series_text_vectors = pd.DataFrame([
	        np.mean([w2v[word] for word in sequence.split()], axis=0)
	        for sequence in df['sequence_clean']
	    ])

	    # Get predictions
	    pred_results = pd.DataFrame(model.predict(series_text_vectors), columns=labels)

	    # Join back to original sequence (both frames share a 0..n-1 index)
	    final_results = df.join(pred_results)
	    # 'others' flags rows for which no topic column was predicted
	    final_results['others'] = (final_results[labels].max(axis=1) == 0).astype(int)

	    # Get sentiment labels
	    final_results['sentiment'] = [
	        get_sentiment_label_facebook(
	            classifier(
	                sequence,
	                candidate_labels=['positive', 'negative'],
	                hypothesis_template='The sentiment of this is {}',
	            )
	        )
	        for sequence in final_results['sequence_clean']
	    ]

	    final_results = final_results.drop(columns=['sequence_clean'])

	    # Append invalid rows
	    if invalid.empty:
	        return final_results.to_csv(index=False).encode('utf-8')
	    return (
	        pd.concat([final_results, invalid])
	        .reset_index(drop=True)
	        .to_csv(index=False)
	        .encode('utf-8')
	    )