import os.path
from typing import Any, Dict, List

import numpy as np
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from tokenizers import ByteLevelBPETokenizer
from transformers import PretrainedConfig

device = "cpu"
embedding_dim = 128
rnn_units = 256
vocab_size = 8000


def clean_text_line(line):
    line = line.strip()  # Remove leading and trailing whitespace
    line = line.lower()  # Lowercase the text (the model expects lowercase input)
    # Strip punctuation characters before vocabulary lookups
    for char in ["'", '"', ",", ".", "?", "!", ":", ";", "(", ")"]:
        line = line.replace(char, "")
    return line


def clean_text(text_lines):
    # Accept either a single string or a list of strings
    if isinstance(text_lines, list):
        return [clean_text_line(line) for line in text_lines]
    return clean_text_line(text_lines)


def spelling_error_rate(text_generated, vocabulary):
    # Split into words and strip punctuation before checking against the vocabulary
    words = clean_text(text_generated.split())
    total_words = len(words)
    if total_words == 0:
        return 0
    misspelled_words = [word for word in words if word.lower() not in vocabulary]
    return len(misspelled_words) / total_words


def generate_text_bpe(
    model,
    start_string,
    generation_length=1000,
    top_k=20,
    temperature=1.0,
    vocabulary=None,
    tokenizer=None,
):
    # Encode the start string to token IDs
    input_ids = tokenizer.encode(start_string).ids
    input_eval = torch.tensor([input_ids], device=device)

    # Collect the generated token strings here
    text_generated = []

    # Initialize the LSTM hidden state for a batch of one
    state_h, state_c = model.init_state(1)

    model.eval()  # Evaluation mode
    with torch.no_grad():
        for _ in range(generation_length):
            output, (state_h, state_c) = model(input_eval, (state_h, state_c))

            # Apply temperature scaling to the logits of the last position
            logits = output[0, -1] / temperature
            probabilities = torch.nn.functional.softmax(logits, dim=0).cpu().numpy()

            # Top-k sampling: keep the k most likely tokens and renormalize
            sorted_indices = np.argsort(probabilities)[-top_k:]
            top_probabilities = probabilities[sorted_indices]
            top_probabilities /= np.sum(top_probabilities)
            # Cast to a plain int so the tokenizer and torch accept it
            predicted_id = int(np.random.choice(sorted_indices, p=top_probabilities))

            # Feed the predicted token ID back in as the next input
            input_eval = torch.tensor([[predicted_id]], device=device)

            # Decode the predicted token ID to text and append it
            text_generated.append(tokenizer.decode([predicted_id]))

    generated_text = start_string + "".join(text_generated)
    error_rate = spelling_error_rate(generated_text, vocabulary)
    return error_rate, generated_text


def load_vocabulary_from_file(file_path):
    vocabulary = set()
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            word = line.strip()  # Remove any leading/trailing whitespace
            if word:  # Skip empty lines
                vocabulary.add(word.lower())  # Lowercase for consistent lookups
    return vocabulary


def get_model():
    class AurelioRNN(nn.Module, PyTorchModelHubMixin):
        def __init__(self, config: dict):
            super().__init__()
            self.config = PretrainedConfig()
            self.config.vocab_size = config.get("vocab_size")
            self.config.embedding_dim = config.get("embedding_dim")
            self.config.rnn_units = config.get("rnn_units")
            self.embedding = nn.Embedding(self.config.vocab_size, self.config.embedding_dim)
            self.lstm = nn.LSTM(
                self.config.embedding_dim, self.config.rnn_units, batch_first=True
            )
            self.fc = nn.Linear(self.config.rnn_units, self.config.vocab_size)

        def forward(self, x, state):
            x = self.embedding(x)
            x, state = self.lstm(x, state)
            x = self.fc(x)
            return x, state

        def init_state(self, batch_size):
            return (
                torch.zeros(1, batch_size, self.config.rnn_units).to(device),
                torch.zeros(1, batch_size, self.config.rnn_units).to(device),
            )

    return AurelioRNN


def calculate_perplexity_on_text(model, text, seq_length, tokenizer):
    loss_fn = nn.CrossEntropyLoss()
    model.eval()
    total_loss = 0
    total_words = 0

    # Tokenize the text
    ids = tokenizer.encode(text).ids

    if len(ids) <= seq_length:
        print(
            "Input text is too short to calculate perplexity. length:",
            len(ids),
            "seq_length:",
            seq_length,
        )
        return float("inf")

    # Build overlapping input/target windows shifted by one token
    inputs = [ids[i : i + seq_length] for i in range(len(ids) - seq_length)]
    targets = [ids[i + 1 : i + seq_length + 1] for i in range(len(ids) - seq_length)]

    state_h, state_c = model.init_state(1)

    with torch.no_grad():
        for i in range(len(inputs)):
            input_tensor = torch.tensor(inputs[i]).unsqueeze(0).to(device)
            target_tensor = torch.tensor(targets[i]).unsqueeze(0).to(device)

            output, (state_h, state_c) = model(
                input_tensor, (state_h.detach(), state_c.detach())
            )

            loss = loss_fn(output.transpose(1, 2), target_tensor)
            total_loss += loss.item()
            total_words += seq_length

    average_loss = total_loss / total_words
    return np.exp(average_loss)


class EndpointHandler:
    def __init__(self, path=""):
        # Load the model and tokenizer once at startup and reuse them for every request
        lstm = get_model()
        config = {
            "vocab_size": vocab_size,
            "embedding_dim": embedding_dim,
            "rnn_units": rnn_units,
        }
        self.model = lstm.from_pretrained("jed-tiotuico/aurelio-rnn", config=config)
        self.model.eval()  # Set the model to evaluation mode
        dir_path = os.path.abspath(os.path.dirname(__file__))
        self.tokenizer = ByteLevelBPETokenizer(
            os.path.join(dir_path, "aurelio_bpe-vocab.json"),
            os.path.join(dir_path, "aurelio_bpe-merges.txt"),
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Args:
            data (:obj:`dict`):
                The request payload; "inputs" holds a list whose first element
                is the start string for generation.
        Return:
            A :obj:`list` of dicts, one per generated sample, each containing:
                - "label": the spelling error rate of the generated text.
                - "score": 1 minus the error rate.
                - "generated_text": the generated Kapampangan text.
                - "perplexity": the model's perplexity on its own output.
        """
        print("data", data)
        inputs = data.pop("inputs", data)
        start_string = inputs[0]

        dir_path = os.path.abspath(os.path.dirname(__file__))
        # Load the Kapampangan vocabulary used to score spelling errors
        kapampangan_vocabulary = load_vocabulary_from_file(
            os.path.join(dir_path, "kapampangan.txt")
        )
        seq_length = 64

        predictions = []
        # Generate 10 samples
        for i in range(10):
            error_rate, generated_text = generate_text_bpe(
                self.model,
                start_string=start_string,
                generation_length=seq_length,
                temperature=1.2,
                top_k=20,
                vocabulary=kapampangan_vocabulary,
                tokenizer=self.tokenizer,
            )
            perplexity = calculate_perplexity_on_text(
                self.model,
                generated_text,
                seq_length=seq_length - 1,
                tokenizer=self.tokenizer,
            )
            predictions.append(
                {
                    "label": error_rate,
                    "score": 1 - error_rate,
                    "generated_text": generated_text,
                    "perplexity": perplexity,
                }
            )
        return predictions
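

# ---------------------------------------------------------------------------
# Local smoke test: an illustrative sketch, not part of the deployed handler.
# It assumes the model weights are reachable on the Hub and that the two BPE
# tokenizer files and kapampangan.txt sit next to this script, exactly as
# EndpointHandler expects. The prompt string below is an arbitrary example.
if __name__ == "__main__":
    handler = EndpointHandler()
    sample_payload = {"inputs": ["ing "]}  # hypothetical Kapampangan prompt
    for prediction in handler(sample_payload):
        print(
            f"error_rate={prediction['label']:.3f} "
            f"score={prediction['score']:.3f} "
            f"perplexity={prediction['perplexity']:.2f}"
        )
        print(prediction["generated_text"])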