aurelio-rnn / handler.py

fixed return value

8a1f10f 5 months ago

No virus

8.9 kB

	import os.path

	from tokenizers import ByteLevelBPETokenizer, Tokenizer
	from typing import Dict, List, Any
	from transformers import pipeline, PretrainedConfig
	import numpy as np
	import torch
	import torch.nn as nn
	from huggingface_hub import PyTorchModelHubMixin

	# Module-wide defaults for the model dimensions and the tensor device.
	vocab_size = 8000  # rows of the embedding table / width of the output layer
	embedding_dim = 128  # embedding width fed into the LSTM
	rnn_units = 256  # LSTM hidden size
	device = "cpu"  # device on which input tensors are created


	# Deletion table for the punctuation removed by clean_text_line.
	_STRIPPED_PUNCTUATION = str.maketrans("", "", "'\",.?!:;()")


	def clean_text_line(line):
	    """Normalize one piece of text for vocabulary lookup.

	    Surrounding whitespace is trimmed, the text is lower-cased, and every
	    occurrence of the characters ``' " , . ? ! : ; ( )`` is deleted.

	    Args:
	        line: The string to normalize.

	    Returns:
	        The normalized string.
	    """
	    normalized = line.strip().lower()
	    return normalized.translate(_STRIPPED_PUNCTUATION)


	def clean_text(text_lines):
	    """Clean a single string or every string in a list.

	    Args:
	        text_lines: Either a ``list`` of strings or a single string.

	    Returns:
	        For a ``list``, a new list holding ``clean_text_line`` applied to each
	        element in order; for anything else, the result of calling
	        ``clean_text_line`` on the argument itself.
	    """
	    if not isinstance(text_lines, list):
	        return clean_text_line(text_lines)
	    return [clean_text_line(entry) for entry in text_lines]


	def spelling_error_rate(text_generated, vocabulary):
	    """Return the fraction of words in ``text_generated`` missing from ``vocabulary``.

	    The text is split on whitespace and each token is normalized with
	    ``clean_text`` (trimmed, lower-cased, punctuation removed) before the
	    membership test.

	    Args:
	        text_generated: The text to score.
	        vocabulary: Container of known words supporting ``in``; entries are
	            expected to be lower-case because tokens are lower-cased before
	            lookup.

	    Returns:
	        A float in ``[0.0, 1.0]``; ``0.0`` when the text contains no words.
	    """
	    cleaned = clean_text(text_generated.split())
	    # A token made only of stripped punctuation (e.g. "." or "?!") cleans to
	    # "", which is not a word; counting it would inflate the error rate.
	    words = [word for word in cleaned if word]
	    if not words:
	        return 0.0
	    misspelled = sum(1 for word in words if word not in vocabulary)
	    return misspelled / len(words)


	def generate_text_bpe(
	    model,
	    start_string,
	    generation_length=1000,
	    top_k=20,
	    temperature=1.0,
	    vocabulary=None,
	    tokenizer=None,
	):
	    """Sample a continuation of ``start_string`` from ``model`` and score its spelling.

	    The prompt is encoded with ``tokenizer`` and fed to the model once; after
	    that, each sampled token id is fed back as the next single-token input
	    while the recurrent state is carried forward.

	    Args:
	        model: Callable as ``model(input_ids, state) -> (logits, state)`` and
	            providing ``init_state(batch_size)`` and ``eval()``.
	        start_string: Prompt text; it is prepended to the returned text.
	        generation_length: Number of tokens to sample.
	        top_k: Sampling is restricted to the ``top_k`` most probable tokens.
	            ``0`` keeps the whole distribution, because ``[-0:]`` slices
	            everything.
	        temperature: Logits are divided by this value before the softmax.
	        vocabulary: Known words passed to ``spelling_error_rate``. Required.
	        tokenizer: Object providing ``encode(text).ids`` and ``decode(ids)``.
	            Required.

	    Returns:
	        A tuple ``(error_rate, generated_text)``.

	    Raises:
	        ValueError: If ``tokenizer`` or ``vocabulary`` is missing, if
	            ``temperature`` is not positive, or if the prompt encodes to no
	            tokens.
	    """
	    # Fail before the generation loop rather than after it: the None defaults
	    # exist only for keyword-style calls and cannot actually be used.
	    if tokenizer is None:
	        raise ValueError("tokenizer is required")
	    if vocabulary is None:
	        raise ValueError("vocabulary is required")
	    if temperature <= 0:
	        raise ValueError("temperature must be positive")

	    input_ids = tokenizer.encode(start_string).ids
	    if not input_ids:
	        # An empty id list would become a float tensor of shape (1, 0).
	        raise ValueError("start_string must encode to at least one token")
	    input_eval = torch.tensor([input_ids], device=device)

	    generated_ids = []
	    state = model.init_state(1)
	    model.eval()

	    with torch.no_grad():
	        for _ in range(generation_length):
	            output, state = model(input_eval, state)

	            # Only the logits of the last position predict the next token.
	            logits = output[0, -1] / temperature
	            probabilities = torch.nn.functional.softmax(logits, dim=0).cpu().numpy()

	            # Keep the top_k most likely ids and renormalize their mass.
	            candidates = np.argsort(probabilities)[-top_k:]
	            candidate_probs = probabilities[candidates]
	            candidate_probs /= np.sum(candidate_probs)

	            predicted_id = int(np.random.choice(candidates, p=candidate_probs))
	            generated_ids.append(predicted_id)

	            # The sampled token becomes the next single-step input.
	            input_eval = torch.tensor([[predicted_id]], device=device)

	    # Decode all ids in one call: a byte-level BPE token may hold only part
	    # of a multi-byte character, so decoding ids one by one can corrupt it.
	    generated_text = start_string + tokenizer.decode(generated_ids)
	    error_rate = spelling_error_rate(generated_text, vocabulary)
	    return error_rate, generated_text


	def load_vocabulary_from_file(file_path):
	    """Read a UTF-8 word list (one entry per line) into a set.

	    Each line is stripped of surrounding whitespace; lines that are empty
	    after stripping are skipped, and the remaining entries are lower-cased.

	    Args:
	        file_path: Path of the text file to read.

	    Returns:
	        A ``set`` of the lower-cased entries.
	    """
	    with open(file_path, "r", encoding="utf-8") as handle:
	        stripped_lines = (raw.strip() for raw in handle)
	        return {entry.lower() for entry in stripped_lines if entry}


	def get_model():
	    """Build and return the ``AurelioRNN`` class (the class, not an instance).

	    Returns:
	        A ``nn.Module`` subclass that also mixes in ``PyTorchModelHubMixin``.
	    """

	    class AurelioRNN(nn.Module, PyTorchModelHubMixin):
	        """Embedding -> single-layer LSTM -> linear projection onto the vocabulary."""

	        def __init__(self, config: dict):
	            """Create the layers from ``config``.

	            Args:
	                config: Mapping that may hold ``"vocab_size"``,
	                    ``"embedding_dim"`` and ``"rnn_units"``. Missing or
	                    ``None`` entries fall back to the module-level defaults.
	            """
	            super().__init__()
	            # Size the layers from the config that was passed in, so the
	            # stored config and the real layer shapes cannot disagree.
	            n_vocab = config.get("vocab_size") or vocab_size
	            n_embedding = config.get("embedding_dim") or embedding_dim
	            n_hidden = config.get("rnn_units") or rnn_units

	            self.config = PretrainedConfig()
	            self.config.vocab_size = n_vocab
	            self.config.embedding_dim = n_embedding
	            self.config.rnn_units = n_hidden

	            self.embedding = nn.Embedding(n_vocab, n_embedding)
	            self.lstm = nn.LSTM(n_embedding, n_hidden, batch_first=True)
	            self.fc = nn.Linear(n_hidden, n_vocab)

	        def forward(self, x, state):
	            """Run token ids ``x`` through the network.

	            Args:
	                x: Integer tensor of token ids, batch dimension first.
	                state: ``(h, c)`` tuple for the LSTM.

	            Returns:
	                ``(logits, state)``: per-position vocabulary logits and the
	                updated LSTM state.
	            """
	            x = self.embedding(x)
	            x, state = self.lstm(x, state)
	            x = self.fc(x)
	            return x, state

	        def init_state(self, batch_size):
	            """Return a zeroed ``(h, c)`` state for ``batch_size`` sequences."""
	            # Match the LSTM's actual hidden size and the device holding the
	            # weights instead of relying on module globals.
	            shape = (1, batch_size, self.lstm.hidden_size)
	            weights_device = self.embedding.weight.device
	            return (
	                torch.zeros(shape, device=weights_device),
	                torch.zeros(shape, device=weights_device),
	            )

	    return AurelioRNN


	def calculate_perplexity_on_text(model, text, seq_length, tokenizer):
	    """Compute the model's perplexity on ``text`` using sliding windows.

	    The text is tokenized and cut into every window of ``seq_length``
	    consecutive tokens (stride 1). Each window is scored against the same
	    window shifted by one token, starting from a fresh recurrent state.

	    Args:
	        model: Callable as ``model(input_ids, state) -> (logits, state)`` and
	            providing ``init_state(batch_size)`` and ``eval()``.
	        text: The text to evaluate.
	        seq_length: Number of tokens per window; must be at least 1.
	        tokenizer: Object providing ``encode(text).ids``.

	    Returns:
	        ``exp`` of the mean per-token cross-entropy as a float, or
	        ``float("inf")`` when the text has ``seq_length`` tokens or fewer.

	    Raises:
	        ValueError: If ``seq_length`` is smaller than 1.
	    """
	    if seq_length < 1:
	        raise ValueError("seq_length must be at least 1")

	    ids = tokenizer.encode(text).ids
	    if len(ids) <= seq_length:
	        print(
	            "Input text is too short to calculate perplexity. length:",
	            len(ids),
	            "seq_length:",
	            seq_length,
	        )
	        return float("inf")

	    # Sum the per-token losses so that dividing by the token count below
	    # gives the true mean. A mean-reduced loss divided by the token count
	    # would shrink the exponent by a factor of seq_length.
	    loss_fn = nn.CrossEntropyLoss(reduction="sum")

	    model.eval()
	    total_loss = 0.0
	    total_tokens = 0

	    with torch.no_grad():
	        for start in range(len(ids) - seq_length):
	            # Windows overlap, so a state carried over from the previous
	            # window would already have seen most of this window's targets.
	            # Start each window from a zero state instead.
	            state = model.init_state(1)
	            state_device = state[0].device

	            window = ids[start : start + seq_length]
	            shifted = ids[start + 1 : start + seq_length + 1]
	            input_tensor = torch.tensor([window], device=state_device)
	            target_tensor = torch.tensor([shifted], device=state_device)

	            output, _ = model(input_tensor, state)
	            # CrossEntropyLoss expects the class dimension second.
	            total_loss += loss_fn(output.transpose(1, 2), target_tensor).item()
	            total_tokens += seq_length

	    return float(np.exp(total_loss / total_tokens))



	class EndpointHandler:
	    """Inference-endpoint entry point that generates text with the Aurelio RNN.

	    The model, tokenizer and spelling vocabulary are loaded once when the
	    handler is constructed and reused for every request.
	    """

	    # Number of samples generated per request.
	    NUM_SAMPLES = 10
	    # Number of tokens sampled for each generated text.
	    SEQ_LENGTH = 64

	    def __init__(self, path=""):
	        """Load the model weights, the BPE tokenizer and the word list.

	        Args:
	            path: Accepted for the handler interface; not used. The tokenizer
	                and vocabulary files are read from this module's directory.
	        """
	        model_cls = get_model()
	        config = {
	            "vocab_size": vocab_size,
	            "embedding_dim": embedding_dim,
	            "rnn_units": rnn_units,
	        }
	        self.model = model_cls.from_pretrained("jed-tiotuico/aurelio-rnn", config=config)
	        self.model.eval()

	        dir_path = os.path.abspath(os.path.dirname(__file__))
	        self.tokenizer = ByteLevelBPETokenizer(
	            os.path.join(dir_path, "aurelio_bpe-vocab.json"),
	            os.path.join(dir_path, "aurelio_bpe-merges.txt"),
	        )
	        # Kapampangan word list used to score spelling of generated text.
	        self.vocabulary = load_vocabulary_from_file(
	            os.path.join(dir_path, "kapampangan.txt")
	        )

	    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
	        """Generate ``NUM_SAMPLES`` continuations of the prompt and score each.

	        Args:
	            data: Request payload. ``data["inputs"]`` is the prompt, either a
	                string or a non-empty list/tuple whose first element is the
	                prompt string. The ``"inputs"`` key is popped from ``data``.

	        Returns:
	            A list with one dict per sample, holding:
	            - "label": the spelling error rate of the generated text.
	            - "score": ``1 - error rate``.
	            - "generated_text": the prompt followed by the sampled text.
	            - "perplexity": the model's perplexity on the generated text.

	        Raises:
	            ValueError: If no usable prompt string is found in ``data``.
	        """
	        print("data", data)
	        inputs = data.pop("inputs", data)
	        if isinstance(inputs, str):
	            # Use the whole string; indexing it would keep one character.
	            start_string = inputs
	        elif isinstance(inputs, (list, tuple)) and inputs and isinstance(inputs[0], str):
	            start_string = inputs[0]
	        else:
	            raise ValueError(
	                "'inputs' must be a string or a non-empty list whose first item is a string"
	            )

	        predictions = []
	        for _ in range(self.NUM_SAMPLES):
	            error_rate, generated_text = generate_text_bpe(
	                self.model,
	                start_string=start_string,
	                generation_length=self.SEQ_LENGTH,
	                temperature=1.2,
	                top_k=20,
	                vocabulary=self.vocabulary,
	                tokenizer=self.tokenizer,
	            )
	            perplexity = calculate_perplexity_on_text(
	                self.model,
	                generated_text,
	                seq_length=self.SEQ_LENGTH - 1,
	                tokenizer=self.tokenizer,
	            )
	            predictions.append(
	                {
	                    "label": error_rate,
	                    "score": 1 - error_rate,
	                    "generated_text": generated_text,
	                    "perplexity": perplexity,
	                }
	            )

	        return predictions