import os.path
from typing import Any, Dict, List

import numpy as np
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from tokenizers import ByteLevelBPETokenizer
from transformers import PretrainedConfig

device = "cpu"
embedding_dim = 128
rnn_units = 256
vocab_size = 8000


def clean_text_line(line):
    line = line.strip()  # Remove leading and trailing whitespace
    line = line.lower()  # Lowercase the text to match the training data
    # Strip quotes and common punctuation
    for char in "'\",.?!:;()":
        line = line.replace(char, "")
    return line


def clean_text(text_lines):
    if isinstance(text_lines, list):
        return [clean_text_line(line) for line in text_lines]
    return clean_text_line(text_lines)


def spelling_error_rate(text_generated, vocabulary):
    # Split into words and strip punctuation before checking against the vocabulary
    words = clean_text(text_generated.split())
    total_words = len(words)
    if total_words == 0:
        return 0
    misspelled_words = [word for word in words if word.lower() not in vocabulary]
    return len(misspelled_words) / total_words


def generate_text_bpe(
    model,
    start_string,
    generation_length=1000,
    top_k=20,
    temperature=1.0,
    vocabulary=None,
    tokenizer=None,
):
    # Encode the start string to token IDs
    input_ids = tokenizer.encode(start_string).ids
    input_eval = torch.tensor([input_ids], device=device)

    # List that accumulates the generated token strings
    text_generated = []

    # Initialize the LSTM hidden state for a batch of one
    state_h, state_c = model.init_state(1)

    model.eval()  # Evaluation mode
    with torch.no_grad():
        for _ in range(generation_length):
            output, (state_h, state_c) = model(input_eval, (state_h, state_c))

            # Apply temperature scaling to the logits of the last position
            logits = output[0, -1] / temperature
            probabilities = torch.nn.functional.softmax(logits, dim=0).cpu().numpy()

            # Top-k sampling: keep the k most likely tokens and renormalize
            sorted_indices = np.argsort(probabilities)[-top_k:]
            top_probabilities = probabilities[sorted_indices]
            top_probabilities /= np.sum(top_probabilities)
            predicted_id = int(np.random.choice(sorted_indices, p=top_probabilities))

            # Feed the predicted token ID back in as the next input
            input_eval = torch.tensor([[predicted_id]], device=device)

            # Decode the predicted token ID and append it to the output
            text_generated.append(tokenizer.decode([predicted_id]))

    generated_text = start_string + "".join(text_generated)
    error_rate = spelling_error_rate(generated_text, vocabulary)
    return error_rate, generated_text


def load_vocabulary_from_file(file_path):
    vocabulary = set()
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            word = line.strip()  # Remove any leading/trailing whitespace
            if word:  # Skip empty lines
                vocabulary.add(word.lower())  # Lowercase for consistent lookups
    return vocabulary


def get_model():
    class AurelioRNN(nn.Module, PyTorchModelHubMixin):
        def __init__(self, config: dict):
            super().__init__()
            self.config = PretrainedConfig()
            self.config.vocab_size = config.get("vocab_size")
            self.config.embedding_dim = config.get("embedding_dim")
            self.config.rnn_units = config.get("rnn_units")
            self.embedding = nn.Embedding(self.config.vocab_size, self.config.embedding_dim)
            self.lstm = nn.LSTM(
                self.config.embedding_dim, self.config.rnn_units, batch_first=True
            )
            self.fc = nn.Linear(self.config.rnn_units, self.config.vocab_size)

        def forward(self, x, state):
            x = self.embedding(x)
            x, state = self.lstm(x, state)
            x = self.fc(x)
            return x, state

        def init_state(self, batch_size):
            return (
                torch.zeros(1, batch_size, self.config.rnn_units, device=device),
                torch.zeros(1, batch_size, self.config.rnn_units, device=device),
            )

    return AurelioRNN


class EndpointHandler:
    def __init__(self, path=""):
        # Load the model weights from the Hub
        lstm = get_model()
        config = {
            "vocab_size": vocab_size,
            "embedding_dim": embedding_dim,
            "rnn_units": rnn_units,
        }
        self.model = lstm.from_pretrained("jed-tiotuico/aurelio-rnn", config=config)
        self.model.eval()  # Evaluation mode for inference

        # Load the BPE tokenizer files shipped next to this script
        dir_path = os.path.abspath(os.path.dirname(__file__))
        self.tokenizer = ByteLevelBPETokenizer(
            os.path.join(dir_path, "aurelio_bpe-vocab.json"),
            os.path.join(dir_path, "aurelio_bpe-merges.txt"),
        )

    def __call__(self, data: Dict[str, Any]) -> List[str]:
        """
        Args:
            data (:obj:`dict`):
                The inference payload; ``data["inputs"]`` holds the start
                string (or a list whose first element is the start string).
        Return:
            A :obj:`list` of generated text samples, one string per sample.
        """
        print("data", data)
        inputs = data.pop("inputs", data)
        # Accept either a plain string or a list of strings as the prompt
        start_string = inputs[0] if isinstance(inputs, list) else inputs

        # Load the Kapampangan vocabulary used to score spelling errors
        dir_path = os.path.abspath(os.path.dirname(__file__))
        kapampangan_vocabulary = load_vocabulary_from_file(
            os.path.join(dir_path, "kapampangan.txt")
        )

        # Length (in BPE tokens) of each generated sample
        seq_length = 64

        predictions = []
        # Generate 10 samples from the same start string, reusing the model
        # and tokenizer loaded in __init__
        for _ in range(10):
            error_rate, generated_text = generate_text_bpe(
                self.model,
                start_string=start_string,
                generation_length=seq_length,
                temperature=1.2,
                top_k=20,
                vocabulary=kapampangan_vocabulary,
                tokenizer=self.tokenizer,
            )
            predictions.append(generated_text)

        # Return the list of generated samples
        return predictions
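

# ---------------------------------------------------------------------------
# Minimal local smoke test: a sketch, not part of the deployed endpoint.
# Assumptions: "aurelio_bpe-vocab.json", "aurelio_bpe-merges.txt" and
# "kapampangan.txt" sit next to this file, the "jed-tiotuico/aurelio-rnn"
# checkpoint is reachable on the Hub, and the prompt below is an arbitrary
# placeholder string.
if __name__ == "__main__":
    handler = EndpointHandler()
    samples = handler({"inputs": ["ing"]})  # placeholder start string
    for sample in samples:
        print(sample)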