import re
from itertools import chain

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def remove_links(review):
    """Strip http/https URLs from a review string."""
    pattern = r'\bhttps?://\S+'
    return re.sub(pattern, '', review)
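# Illustration only (the string below is made up, not from the dataset):
# remove_links("Great coffee, see https://example.com for details")
# -> "Great coffee, see  for details"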
# Local preprocessing of the original reviews CSV, kept for reference:
# df = pd.read_csv('/Users/danfinel/Downloads/Reviews.csv')
# df = df.loc[:, ['Text']].iloc[:1000]
# df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)  # drop HTML tags
# df['Text'] = df['Text'].apply(remove_links)
# Fine-tuned BERT regression head (single output) and its tokenizer, loaded from a local directory.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert_regr_other_pretrained', num_labels=1)
tokenizer = AutoTokenizer.from_pretrained('bert_regr_other_pretrained')


def preprocess_function_regr(examples):
    """Tokenize review text into fixed-length (64-token) inputs for the regression model."""
    return tokenizer(examples["Text"], truncation=True, max_length=64, padding='max_length')
def get_predictions(reviews):
    """Run the regression model on a DataFrame with a 'Text' column and return its raw scores."""
    new_ds_regr = Dataset.from_pandas(reviews)
    new_ds_regr_tok = new_ds_regr.map(preprocess_function_regr, remove_columns=['Text'])
    input_ids = torch.tensor(new_ds_regr_tok['input_ids'])
    token_type_ids = torch.tensor(new_ds_regr_tok['token_type_ids'])
    attention_mask = torch.tensor(new_ds_regr_tok['attention_mask'])
    with torch.no_grad():
        outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    predictions = outputs.logits  # shape (n_reviews, 1): one predicted rating per review
    return predictions
def get_ratings_perc(reviews):
    """Round the predicted scores to 1-5 stars and return the percentage share of each star."""
    preds = get_predictions(reviews)
    predictions_list = list(chain.from_iterable(preds.tolist()))
    predictions_clipped = np.clip(predictions_list, 1, 5)
    predictions_rounded = [int(round(x)) for x in predictions_clipped]
    # np.bincount with minlength=6 keeps a slot for every star value even when its count is zero,
    # so the result always has 5 entries; the slot for value 0 is dropped.
    counts = np.bincount(predictions_rounded, minlength=6)[1:]
    ratings_perc = counts / counts.sum() * 100
    return ratings_perc
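# Worked example with hypothetical rounded predictions [5, 5, 4, 1]:
# np.bincount([5, 5, 4, 1], minlength=6)[1:] == [1, 0, 0, 1, 2]
# -> [25., 0., 0., 25., 50.] percent for stars 1 through 5.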
def get_ratings_dic(reviews):
    """Map each star rating (1-5) to its percentage, formatted as a string."""
    ratings_perc = get_ratings_perc(reviews)
    dic = {}
    for i in range(1, 6):
        dic[i] = f'{ratings_perc[i - 1].round(2)} %'
    return dic
# print(get_ratings_dic(df))
# new_test = pd.DataFrame(df.loc[:, 'Text'].iloc[:1000])
# new_ds_regr = Dataset.from_pandas(new_test)
# new_ds_regr_tok = new_ds_regr.map(preprocess_function_regr, remove_columns=['Text'])
#
# input_ids = torch.tensor(new_ds_regr_tok['input_ids'])
# token_type_ids = torch.tensor(new_ds_regr_tok['token_type_ids'])
# attention_mask = torch.tensor(new_ds_regr_tok['attention_mask'])
# with torch.no_grad():
#     outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
# predictions = outputs.logits
#
# predictions_list = list(chain.from_iterable(predictions.tolist()))
# predictions_array = np.clip(predictions_list, 1, 5)
# predictions_array = [round(x) for x in predictions_array]
# print(np.unique(predictions_array, return_counts=True))
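

# Minimal usage sketch (added for illustration; the two review strings below are made up,
# and the printed percentages depend entirely on the fine-tuned model):
if __name__ == '__main__':
    sample = pd.DataFrame({'Text': [
        'Absolutely loved this coffee, will definitely buy again!',
        'Arrived stale and the packaging was damaged.',
    ]})
    # Apply the same cleaning used on the training data.
    sample['Text'] = sample['Text'].str.replace(r'<[^>]*>', '', regex=True)
    sample['Text'] = sample['Text'].apply(remove_links)
    print(get_ratings_dic(sample))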