Spaces:

dfinel
/

deployment_final_project

Sleeping

App Files Files Community

deployment_final_project / bert_regression.py

dfinel

Update bert_regression.py

c32bcca verified 3 months ago

raw history blame

No virus

2.79 kB

	import pandas as pd
	import numpy as np
	from datasets import Dataset
	from transformers import AutoTokenizer,AutoModelForSequenceClassification
	import torch
	from itertools import chain
	import re
	def remove_links(review):
	    """Return *review* with every http:// or https:// URL deleted.

	    A URL is matched from a word boundary through the last consecutive
	    non-whitespace character. Only the URL itself is removed, so the
	    whitespace that surrounded it is left in place.
	    """
	    url_regex = re.compile(r'\bhttps?://\S+')
	    return url_regex.sub('', review)


	# df = pd.read_csv('/Users/danfinel/Downloads/Reviews.csv')
	# df = df.loc[:,['Text']].iloc[:1000]
	# df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)
	# df['Text'] = df['Text'].apply(remove_links)

	# Checkpoint location shared by the model and its tokenizer.
	_CHECKPOINT = 'bert_regr_other_pretrained'
	# num_labels=1 gives the model a single output value per input, which the
	# functions below treat as a predicted rating.
	model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT, num_labels=1)
	tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

	def preprocess_function_regr(examples):
	    """Tokenize the ``"Text"`` entry of *examples* with the module tokenizer.

	    The tokenizer is asked to truncate to 64 tokens and to pad to that same
	    maximum length. The tokenizer's result is returned unchanged.
	    """
	    tokenizer_options = {
	        'truncation': True,
	        'max_length': 64,
	        'padding': 'max_length',
	    }
	    return tokenizer(examples["Text"], **tokenizer_options)

	def get_predictions(reviews, batch_size=32):
	    """Score every review with the regression model and return the raw logits.

	    Args:
	        reviews: pandas DataFrame with a ``"Text"`` column of review strings.
	        batch_size: number of reviews sent through the model per forward
	            pass. Batching keeps peak memory bounded for large inputs.

	    Returns:
	        A tensor holding the model's logits for all reviews, concatenated in
	        input order. An empty DataFrame yields an empty tensor of shape
	        (0, 1), matching the single label the model is loaded with.

	    Raises:
	        ValueError: if ``batch_size`` is smaller than 1.
	    """
	    if batch_size < 1:
	        raise ValueError('batch_size must be a positive integer')
	    # Nothing to score: skip tokenization and the model entirely, since an
	    # empty batch cannot be turned into valid model inputs.
	    if len(reviews) == 0:
	        return torch.empty((0, 1))
	    new_ds_regr = Dataset.from_pandas(reviews)
	    # batched=True hands the tokenizer many texts per call instead of one.
	    new_ds_regr_tok = new_ds_regr.map(
	        preprocess_function_regr, batched=True, remove_columns=['Text'])
	    # Forward only the inputs the tokenizer actually produced; not every
	    # tokenizer emits token_type_ids.
	    input_names = [
	        name
	        for name in ('input_ids', 'token_type_ids', 'attention_mask')
	        if name in new_ds_regr_tok.column_names
	    ]
	    batch_logits = []
	    with torch.no_grad():
	        for start in range(0, len(new_ds_regr_tok), batch_size):
	            # Slicing a Dataset returns a dict of column name -> list of rows.
	            rows = new_ds_regr_tok[start:start + batch_size]
	            inputs = {name: torch.tensor(rows[name]) for name in input_names}
	            batch_logits.append(model(**inputs).logits)
	    return torch.cat(batch_logits)

	def get_ratings_perc(reviews):
	    """Return the percentage of reviews predicted at each rating from 1 to 5.

	    Model outputs are clipped to the range [1, 5] and rounded to the nearest
	    integer (halves round to the even neighbour) before counting.

	    Args:
	        reviews: pandas DataFrame passed straight to ``get_predictions``.

	    Returns:
	        A float array of length 5 whose element ``k - 1`` is the percentage
	        of reviews rated ``k``. Ratings that never occur get 0.0, so positions
	        always line up with ratings. With no predictions at all the result is
	        five zeros.
	    """
	    preds = get_predictions(reviews)
	    scores = np.asarray(preds.tolist(), dtype=float).reshape(-1)
	    ratings = np.rint(np.clip(scores, 1, 5)).astype(int)
	    # bincount with minlength=6 reserves a slot for every value 0..5 even if
	    # it is absent; dropping slot 0 leaves exactly the ratings 1..5.
	    counts = np.bincount(ratings, minlength=6)[1:]
	    total = counts.sum()
	    if total == 0:
	        # Avoid 0/0 when there is nothing to count.
	        return np.zeros(5)
	    return counts / total * 100

	def get_ratings_dic(reviews):
	    """Build a ``{rating: "<percentage> %"}`` dict for the ratings 1 to 5.

	    Each percentage comes from ``get_ratings_perc(reviews)``, rounded to two
	    decimals and followed by a space and a percent sign.

	    NOTE(review): entries are looked up by position (rating ``k`` reads index
	    ``k - 1``), so this assumes ``get_ratings_perc`` returns five values
	    ordered by rating - confirm.
	    """
	    shares = get_ratings_perc(reviews)
	    return {stars: f'{shares[stars - 1].round(2)} %' for stars in range(1, 6)}

	#print(get_ratings_dic(df))




	# new_test = pd.DataFrame(df.loc[:,'Text'].iloc[:1000])
	# new_ds_regr = Dataset.from_pandas(new_test)
	# new_ds_regr_tok = new_ds_regr.map(preprocess_function_regr, remove_columns = ['Text'])
	#
	# input_ids = torch.tensor(new_ds_regr_tok['input_ids'])
	# token_type_ids = torch.tensor(new_ds_regr_tok['token_type_ids'])
	# attention_mask = torch.tensor(new_ds_regr_tok['attention_mask'])
	# with torch.no_grad():
	# outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
	# predictions = outputs.logits
	#
	# predictions_list = list(chain.from_iterable(predictions.tolist()))
	# predictions_array = np.clip(predictions_list,1,5)
	# predictions_array = [round(x) for x in predictions_array]
	# print(np.unique(predictions_array, return_counts = True))