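# NOTE: the lines below are file-viewer page residue captured together with
# the source; they are kept as comments so the module parses as Python.
# deployment_final_project / bert_regression.py
# dfinel's picture
# Update bert_regression.py
# c32bcca verified
# raw history blame
# No virus
# 2.79 kB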
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer,AutoModelForSequenceClassification
import torch
from itertools import chain
import re
def remove_links(review):
    """Return *review* with every ``http://`` / ``https://`` link removed.

    A link is the scheme followed by a run of non-whitespace characters, and
    it must begin at a word boundary (so ``xhttp://...`` is left untouched).
    Only the link text itself is deleted; surrounding whitespace is kept.

    Args:
        review: The text to clean.

    Returns:
        The text with each matched link replaced by the empty string.
    """
    link_regex = re.compile(r'\bhttps?://\S+')
    return link_regex.sub('', review)
# df = pd.read_csv('/Users/danfinel/Downloads/Reviews.csv')
# df = df.loc[:,['Text']].iloc[:1000]
# df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)
# df['Text'] = df['Text'].apply(remove_links)
# Both the tokenizer and the model are loaded from the same checkpoint
# location; the two loads are independent of each other.
_CHECKPOINT = 'bert_regr_other_pretrained'

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

# num_labels=1 requests a single output value per input (used here as a
# predicted rating score by the functions below).
model = AutoModelForSequenceClassification.from_pretrained(
    _CHECKPOINT,
    num_labels=1,
)
def preprocess_function_regr(examples):
    """Tokenize the ``Text`` field of *examples* with the module tokenizer.

    Every sequence is truncated to, and padded up to, exactly 64 tokens so
    that all encoded rows have the same length.

    Args:
        examples: A mapping with a ``"Text"`` entry (a dataset row or batch).

    Returns:
        Whatever the tokenizer returns for that text.
    """
    encode_options = {
        'truncation': True,
        'max_length': 64,
        'padding': 'max_length',
    }
    return tokenizer(examples["Text"], **encode_options)
def get_predictions(reviews, batch_size=32):
    """Run the regression model over *reviews* and return its raw outputs.

    The reviews are tokenized with ``preprocess_function_regr`` and fed to
    the module-level ``model`` in chunks of ``batch_size`` rows, so memory
    use no longer grows with the total number of reviews.

    Args:
        reviews: pandas DataFrame with a ``Text`` column of review strings.
        batch_size: Number of reviews per forward pass (must be >= 1).

    Returns:
        A tensor holding the model's ``logits`` for every review, stacked
        along the first dimension in input order. For empty input a tensor
        with zero rows is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    #new_test = pd.DataFrame(reviews)
    new_ds_regr = Dataset.from_pandas(reviews)
    # batched=True hands the tokenizer lists of texts instead of one text
    # per call; the encoded rows are the same because every row is padded
    # to the same fixed length.
    new_ds_regr_tok = new_ds_regr.map(
        preprocess_function_regr, batched=True, remove_columns=['Text'])

    logit_chunks = []
    with torch.no_grad():
        for start in range(0, len(new_ds_regr_tok), batch_size):
            # Slicing a Dataset yields a dict of column name -> list of rows.
            batch = new_ds_regr_tok[start:start + batch_size]
            outputs = model(
                input_ids=torch.tensor(batch['input_ids']),
                token_type_ids=torch.tensor(batch['token_type_ids']),
                attention_mask=torch.tensor(batch['attention_mask']),
            )
            logit_chunks.append(outputs.logits)

    if not logit_chunks:
        # No reviews: return an empty result instead of calling the model
        # on an empty batch.
        return torch.empty((0, model.config.num_labels))
    return torch.cat(logit_chunks)
def get_ratings_perc(reviews):
    """Return the percentage of reviews predicted at each rating 1..5.

    Predictions from ``get_predictions`` are clipped to the range [1, 5] and
    rounded to the nearest integer rating. The result always has exactly
    five entries, where index ``k`` holds the percentage for rating
    ``k + 1``; ratings that no review received are reported as 0 instead of
    being dropped, so positions never shift.

    Args:
        reviews: Input accepted by ``get_predictions``.

    Returns:
        numpy array of five floats (percentages). All zeros when there are
        no predictions.
    """
    preds = get_predictions(reviews)
    scores = np.asarray(preds.tolist(), dtype=float).ravel()
    # np.rint rounds halves to the nearest even integer, the same rule the
    # built-in round() applies, so 2.5 -> 2 and 3.5 -> 4 as before.
    ratings = np.rint(np.clip(scores, 1, 5)).astype(int)
    # Fixed-size histogram: slot 0 is unused, slots 1..5 are the ratings.
    counts = np.bincount(ratings, minlength=6)[1:6]
    total = counts.sum()
    if total == 0:
        # Avoid dividing by zero when there is nothing to count.
        return np.zeros(5)
    return counts / total * 100
def get_ratings_dic(reviews):
    """Build a ``{rating: "<percentage> %"}`` mapping for ratings 1 to 5.

    The percentages come from ``get_ratings_perc``; each one is rounded to
    two decimals and formatted as text followed by `` %``.

    NOTE(review): this reads positions 0..4 of that result as ratings 1..5,
    so it assumes five entries ordered by rating - confirm with the callee.

    Args:
        reviews: Input accepted by ``get_ratings_perc``.

    Returns:
        dict with integer keys 1..5 and string values.
    """
    percentages = get_ratings_perc(reviews)
    return {
        stars: f'{percentages[stars - 1].round(2)} %'
        for stars in range(1, 6)
    }
#print(get_ratings_dic(df))
# new_test = pd.DataFrame(df.loc[:,'Text'].iloc[:1000])
# new_ds_regr = Dataset.from_pandas(new_test)
# new_ds_regr_tok = new_ds_regr.map(preprocess_function_regr, remove_columns = ['Text'])
#
# input_ids = torch.tensor(new_ds_regr_tok['input_ids'])
# token_type_ids = torch.tensor(new_ds_regr_tok['token_type_ids'])
# attention_mask = torch.tensor(new_ds_regr_tok['attention_mask'])
# with torch.no_grad():
# outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
# predictions = outputs.logits
#
# predictions_list = list(chain.from_iterable(predictions.tolist()))
# predictions_array = np.clip(predictions_list,1,5)
# predictions_array = [round(x) for x in predictions_array]
# print(np.unique(predictions_array, return_counts = True))