Spaces:

cheesexuebao
/

murphy

Sleeping

App Files Files Community

murphy / Prediction.py

cheesexuebao

Load model from hub

afc3372 7 months ago

raw

history blame

No virus

2.79 kB

	import pandas as pd
	from tqdm.auto import tqdm
	import torch
	from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification
	import os
	import glob


	RANDOM_SEED = 42
	pd.RANDOM_SEED = 42
	LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone", "None"]


	@torch.no_grad()
	def predict_csv(data, text_col, tokenizer, model, device, text_bs=16, max_token_len=128):
	predictions = []
	post = data[text_col]
	num_text = len(post)
	generator = range(0, num_text, text_bs)
	for i in tqdm(generator, total=len(generator), desc="Processing..."):
	texts = post[i: min(num_text, i+text_bs)].tolist()
	encoding = tokenizer(
	texts,
	add_special_tokens=True,
	max_length=max_token_len,
	return_token_type_ids=False,
	padding="max_length",
	truncation=True,
	return_attention_mask=True,
	return_tensors='pt',
	)
	logits = model(
	encoding["input_ids"].to(device),
	encoding["attention_mask"].to(device),
	return_dict=True
	).logits
	prediction = torch.softmax(logits, dim=1)
	predictions.append(prediction.detach().cpu())

	final_pred = torch.cat(predictions, dim=0)
	y_inten = final_pred.numpy().T

	for i in range(len(LABEL_COLUMNS)):
	data[LABEL_COLUMNS[i]] = [round(i, 8) for i in y_inten[i].tolist()]
	return data

	@torch.no_grad()
	def predict_single(sentence, tokenizer, model, device, max_token_len=128):
	encoding = tokenizer(
	sentence,
	add_special_tokens=True,
	max_length=max_token_len,
	return_token_type_ids=False,
	padding="max_length",
	truncation=True,
	return_attention_mask=True,
	return_tensors='pt',
	)
	logits = model(
	encoding["input_ids"].to(device),
	encoding["attention_mask"].to(device),
	return_dict=True
	).logits
	prediction = torch.softmax(logits, dim=1)
	y_inten = prediction.flatten().cpu().numpy().T.tolist()
	y_inten = [round(i, 8) for i in y_inten]
	return y_inten



	if __name__ == "__main__":

	Data = pd.read_csv("assets/Kickstarter_sentence_level_5000.csv")
	Data = Data[:20]
	device = torch.device('cpu')

	# Load model directly
	tokenizer = BertTokenizer.from_pretrained("Oliver12315/Brand_Tone_of_Voice")
	model = BertForSequenceClassification.from_pretrained("Oliver12315/Brand_Tone_of_Voice")
	model = model.to(device)
	fk_doc_result = predict_csv(Data,"content", tokenizer, model, device)
	single_response = predict_single("Games of the imagination teach us actions have consequences in a realm that can be reset.", tokenizer, model, device)
	fk_doc_result.to_csv(f"output/prediction_Brand_Tone_of_Voice.csv")