import os.path
from typing import Any, Dict, List

import numpy as np
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from tokenizers import ByteLevelBPETokenizer
from transformers import PretrainedConfig

# Generation runs on CPU; these hyperparameters must match the Hub checkpoint.
device = "cpu"
embedding_dim = 128
rnn_units = 256
vocab_size = 8000

def clean_text_line(line):
    """Lowercase a line, trim whitespace, and strip common punctuation."""
    line = line.strip().lower()
    for char in "'\",.?!:;()":
        line = line.replace(char, "")
    return line
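# For example, clean_text_line('  "Nanu ya?"  ') returns "nanu ya".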


def clean_text(text_lines):
    """Clean a list of lines, or a single string, with clean_text_line."""
    if isinstance(text_lines, list):
        return [clean_text_line(line) for line in text_lines]
    return clean_text_line(text_lines)

def spelling_error_rate(text_generated, vocabulary):
    """Fraction of generated words that do not appear in the vocabulary."""
    # Drop tokens that are empty after cleaning (e.g. bare punctuation).
    words = [word for word in clean_text(text_generated.split()) if word]
    if not words:
        return 0.0
    misspelled_words = [word for word in words if word not in vocabulary]
    return len(misspelled_words) / len(words)
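# Illustrative example: with vocabulary {"ing", "aldo"}, the text
# "Ing aldo xyz" cleans to ["ing", "aldo", "xyz"] and scores an error
# rate of 1/3, since only "xyz" is out of vocabulary.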


def generate_text_bpe(
    model,
    start_string,
    generation_length=1000,
    top_k=20,
    temperature=1.0,
    vocabulary=None,
    tokenizer=None,
):
    """Sample text with top-k sampling; return (spelling_error_rate, text)."""
    # Encode the prompt and feed it through in one shot to warm up the
    # LSTM state.
    input_ids = tokenizer.encode(start_string).ids
    input_eval = torch.tensor([input_ids], device=device)

    text_generated = []
    state_h, state_c = model.init_state(1)

    model.eval()
    with torch.no_grad():
        for _ in range(generation_length):
            output, (state_h, state_c) = model(input_eval, (state_h, state_c))

            # Temperature-scale the logits of the last position only.
            logits = output[0, -1] / temperature
            probabilities = torch.nn.functional.softmax(logits, dim=0).cpu().numpy()

            # Keep the top_k most likely token ids and renormalize their mass.
            sorted_indices = np.argsort(probabilities)[-top_k:]
            top_probabilities = probabilities[sorted_indices]
            top_probabilities /= np.sum(top_probabilities)
            predicted_id = int(np.random.choice(sorted_indices, p=top_probabilities))

            # Feed only the sampled token back in; the LSTM state carries
            # the rest of the context.
            input_eval = torch.tensor([[predicted_id]], device=device)
            text_generated.append(tokenizer.decode([predicted_id]))

    generated_text = start_string + "".join(text_generated)
    error_rate = spelling_error_rate(generated_text, vocabulary)
    return error_rate, generated_text
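# Example of the sampling step above: with top_k=3 and softmax output
# [0.5, 0.3, 0.1, 0.1], only the three most likely ids are kept, their
# probabilities renormalized to [5/9, 3/9, 1/9], and the next id drawn
# from that reduced distribution.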


def load_vocabulary_from_file(file_path):
    """Read a word list (one word per line) into a lowercase set."""
    vocabulary = set()
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            word = line.strip()
            if word:
                vocabulary.add(word.lower())
    return vocabulary


def get_model():
    class AurelioRNN(nn.Module, PyTorchModelHubMixin):
        def __init__(self, config: dict):
            super().__init__()
            self.config = PretrainedConfig()
            self.config.vocab_size = config.get("vocab_size")
            self.config.embedding_dim = config.get("embedding_dim")
            self.config.rnn_units = config.get("rnn_units")
            # Build the layers from the config rather than from module
            # globals, so the architecture cannot drift from the checkpoint.
            self.embedding = nn.Embedding(self.config.vocab_size, self.config.embedding_dim)
            self.lstm = nn.LSTM(self.config.embedding_dim, self.config.rnn_units, batch_first=True)
            self.fc = nn.Linear(self.config.rnn_units, self.config.vocab_size)

        def forward(self, x, state):
            x = self.embedding(x)
            x, state = self.lstm(x, state)
            x = self.fc(x)
            return x, state

        def init_state(self, batch_size):
            # One (num_layers, batch, hidden) pair of zeros for the LSTM.
            return (
                torch.zeros(1, batch_size, self.config.rnn_units, device=device),
                torch.zeros(1, batch_size, self.config.rnn_units, device=device),
            )

    return AurelioRNN
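# Shape walkthrough for the forward pass above, with the default config:
# token ids (batch, seq) -> embedding (batch, seq, 128) -> LSTM
# (batch, seq, 256) -> linear head (batch, seq, 8000) logits over the vocab.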


class EndpointHandler:
    def __init__(self, path=""):
        # Load the checkpoint and tokenizer once; __call__ reuses them.
        lstm = get_model()
        config = {
            "vocab_size": vocab_size,
            "embedding_dim": embedding_dim,
            "rnn_units": rnn_units,
        }
        self.model = lstm.from_pretrained("jed-tiotuico/aurelio-rnn", config=config)
        self.model.eval()

        dir_path = os.path.abspath(os.path.dirname(__file__))
        self.tokenizer = ByteLevelBPETokenizer(
            os.path.join(dir_path, "aurelio_bpe-vocab.json"),
            os.path.join(dir_path, "aurelio_bpe-merges.txt"),
        )
        # Reference word list used to score generations for spelling errors.
        self.vocabulary = load_vocabulary_from_file(
            os.path.join(dir_path, "kapampangan.txt")
        )

    def __call__(self, data: Dict[str, Any]) -> List[str]:
        """
        Args:
            data (:obj:`dict`):
                Includes the input data under the "inputs" key: either the
                start string itself or a list whose first element is the
                start string.
        Return:
            A :obj:`list` of generated text samples, one per sampling run.
        """
        inputs = data.pop("inputs", data)
        start_string = inputs if isinstance(inputs, str) else inputs[0]

        seq_length = 64
        predictions = []
        # Draw ten samples; temperature 1.2 trades fluency for variety.
        for _ in range(10):
            _error_rate, generated_text = generate_text_bpe(
                self.model,
                start_string=start_string,
                generation_length=seq_length,
                temperature=1.2,
                top_k=20,
                vocabulary=self.vocabulary,
                tokenizer=self.tokenizer,
            )
            predictions.append(generated_text)

        return predictions
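

# A minimal local smoke test, assuming the Hub checkpoint and the
# aurelio_bpe-* tokenizer files are available next to this script;
# the prompt "ing" is only a placeholder.
if __name__ == "__main__":
    handler = EndpointHandler()
    for sample in handler({"inputs": ["ing"]}):
        print(sample)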