# NOTE: the lines below are web-page header text captured with this file, not code.
# murphy / Prediction.py
# cheesexuebao's picture
# Load model from hub
# afc3372
# raw history blame
# No virus
# 2.79 kB
import pandas as pd
from tqdm.auto import tqdm
import torch
from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification
import os
import glob
# Seed constant. Nothing in the visible code hands it to a random number
# generator, so on its own it does not make any computation deterministic.
RANDOM_SEED = 42

# Mirrors the seed as a plain attribute on the pandas module object.
# NOTE(review): pandas is not known to consult an attribute with this name;
# kept only because it is an existing module-level side effect -- confirm
# before relying on it or removing it.
pd.RANDOM_SEED = RANDOM_SEED

# Names of the columns predict_csv adds to its DataFrame. Entry k is paired
# with class index k of the model output, so the order matters.
LABEL_COLUMNS = [
    "Assertive Tone",
    "Conversational Tone",
    "Emotional Tone",
    "Informative Tone",
    "None",
]
@torch.no_grad()
def predict_csv(data, text_col, tokenizer, model, device, text_bs=16, max_token_len=128):
    """Score every text in ``data[text_col]`` and store the probabilities in ``data``.

    The texts are tokenized and run through ``model`` in batches of
    ``text_bs``. The logits are turned into probabilities with a softmax over
    dimension 1, rounded to 8 decimal places, and written into one column per
    entry of ``LABEL_COLUMNS`` (entry k receives class index k).

    Args:
        data: pandas DataFrame holding the texts. It is modified in place.
        text_col: Name of the column in ``data`` that contains the texts.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            and ``"attention_mask"`` tensors.
        model: Callable model whose output exposes ``.logits``. It is used in
            whatever train/eval mode the caller left it in.
        device: Device the encoded tensors are moved to before the forward
            pass. The model is not moved by this function.
        text_bs: Number of texts per forward pass. Must be at least 1.
        max_token_len: Length every sequence is padded or truncated to.

    Returns:
        The same ``data`` object, with the ``LABEL_COLUMNS`` columns added or
        overwritten.

    Raises:
        ValueError: If ``text_bs`` is smaller than 1.
        IndexError: If the model yields fewer classes than ``LABEL_COLUMNS``
            has entries.
    """
    if text_bs < 1:
        raise ValueError(f"text_bs must be a positive integer, got {text_bs!r}")

    post = data[text_col]
    num_text = len(post)

    if num_text == 0:
        # torch.cat() rejects an empty list of tensors, so there is nothing to
        # run: just add empty float columns and hand the frame back.
        for label in LABEL_COLUMNS:
            data[label] = pd.Series(index=data.index, dtype="float64")
        return data

    predictions = []
    generator = range(0, num_text, text_bs)
    for start in tqdm(generator, total=len(generator), desc="Processing..."):
        # .iloc keeps the batching positional whatever index the frame carries;
        # a slice end past the last row is clipped automatically.
        texts = post.iloc[start:start + text_bs].tolist()
        encoding = tokenizer(
            texts,
            add_special_tokens=True,
            max_length=max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        logits = model(
            encoding["input_ids"].to(device),
            encoding["attention_mask"].to(device),
            return_dict=True
        ).logits
        prediction = torch.softmax(logits, dim=1)
        # Collect results on the CPU so they can be converted to NumPy below.
        predictions.append(prediction.detach().cpu())

    # Stack the batches along dim 0, then transpose so that row k holds the
    # scores of class k for every text (assumes 2-D logits: texts x classes).
    per_class = torch.cat(predictions, dim=0).numpy().T
    for class_idx, label in enumerate(LABEL_COLUMNS):
        data[label] = [round(score, 8) for score in per_class[class_idx].tolist()]
    return data
@torch.no_grad()
def predict_single(sentence, tokenizer, model, device, max_token_len=128):
    """Return the rounded class probabilities the model assigns to ``sentence``.

    Args:
        sentence: Text handed to ``tokenizer`` as its first argument.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            and ``"attention_mask"`` tensors.
        model: Callable model whose output exposes ``.logits``.
        device: Device the encoded tensors are moved to before the forward
            pass. The model is not moved by this function.
        max_token_len: Length the sequence is padded or truncated to.

    Returns:
        A flat list of Python floats: the softmax (over dimension 1) of the
        logits, flattened and rounded to 8 decimal places. Because the result
        is flattened, logits with more than one row come back concatenated.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = tokenizer(sentence, **tokenizer_options)

    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    output = model(input_ids, attention_mask, return_dict=True)

    probabilities = torch.softmax(output.logits, dim=1)
    # Move to the CPU before the NumPy conversion; the flattened array is 1-D.
    flat_scores = probabilities.flatten().cpu().numpy().tolist()
    return [round(score, 8) for score in flat_scores]
if __name__ == "__main__":
    # Demo run: score the first 20 rows of the sample CSV plus one stand-alone
    # sentence, on the CPU, and write the scored rows to disk.
    model_name = "Oliver12315/Brand_Tone_of_Voice"
    output_path = "output/prediction_Brand_Tone_of_Voice.csv"

    Data = pd.read_csv("assets/Kickstarter_sentence_level_5000.csv")
    # predict_csv adds columns in place; copy the slice so those writes go to
    # an independent frame instead of triggering SettingWithCopyWarning.
    Data = Data[:20].copy()
    device = torch.device('cpu')

    # Load model directly
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name)
    model = model.to(device)

    fk_doc_result = predict_csv(Data, "content", tokenizer, model, device)
    single_response = predict_single("Games of the imagination teach us actions have consequences in a realm that can be reset.", tokenizer, model, device)
    # The single-sentence scores were previously computed and then dropped.
    print(single_response)

    # to_csv does not create missing directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    fk_doc_result.to_csv(output_path)