# Gradio Spaces app: GPT-2 perplexity-based AI-generated-text detector.
import pickle

import gradio as gr
import numpy as np
import torch
from nltk import word_tokenize, sent_tokenize
from scipy.stats import shapiro
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Inference-only GPT-2 large; moved to the GPU once at startup.
model = GPT2LMHeadModel.from_pretrained('gpt2-large').to('cuda')
tokenizer: GPT2TokenizerFast = GPT2TokenizerFast.from_pretrained('gpt2-large')

# Pre-fitted classifier (2-class: Human vs AI) over perplexity / sentence-length
# features. NOTE(review): pickle.load is only acceptable because model.pkl ships
# with the app — never load a pickle from an untrusted source.
with open('model.pkl', 'rb') as f:
    lr_model = pickle.load(f)
def get_perplexity(text: str):
    """Score *text* with GPT-2 and build per-word highlight data.

    Args:
        text: The raw input string (truncated to the model's max length).

    Returns:
        A ``(loss, highlights)`` pair: ``loss`` is the model's mean
        cross-entropy (0-d tensor, a perplexity proxy), and ``highlights``
        is a list of ``(substring, score)`` tuples covering the tokenized
        words, with scores normalized to [0, 1] by the maximum token loss.
    """
    tokens = tokenizer(text, return_tensors='pt', truncation=True, return_offsets_mapping=True)
    inputs = tokens.input_ids.to('cuda')
    targets = inputs.clone()
    with torch.no_grad():
        outputs = model(inputs, labels=targets)
    labels = targets.to(outputs.logits.device)
    # Shift so that tokens < n predict n
    shift_logits = outputs.logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    # reduction='none' keeps one loss per predicted token; the old
    # `reduce=False` argument is deprecated and rejected by current torch.
    perplexities = torch.nn.functional.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
        reduction='none',
    )
    perplexities = perplexities.to('cpu').numpy()
    # Normalize to [0, 1] so gr.HighlightedText gets comparable scores.
    perplexities = (perplexities / np.max(perplexities)).tolist()
    # The first token is never predicted, so the leading word scores 0.
    output = [(text[:tokens.word_to_chars(0)[1]], 0)]
    # perplexities[i] is the loss for token i+1, aligning with word_ids()[1:].
    for word_id, p in zip(tokens.word_ids()[1:], perplexities):
        # Emit each word once; multi-token words keep their first token's score.
        if word_id == len(output):
            span = tokens.word_to_chars(word_id)
            output.append((text[span[0]:span[1]], p))
    return outputs.loss, output
def score_text(text):
    """Return (class-probability dict, per-word highlight list) for *text*.

    Features fed to the classifier: overall model loss, mean and standard
    deviation of sentence lengths (in words), and a Shapiro-Wilk normality
    p-value over those lengths (0.5 fallback when too few sentences).
    """
    perplexity, word_perplexities = get_perplexity(text)
    sentence_lengths = [len(word_tokenize(s)) for s in sent_tokenize(text)]
    # shapiro needs at least 3 samples; use a neutral p-value otherwise.
    normality = shapiro(sentence_lengths).pvalue if len(sentence_lengths) > 2 else 0.5
    features = [[perplexity.item(), np.mean(sentence_lengths), np.std(sentence_lengths), normality]]
    human_prob, ai_prob = lr_model.predict_proba(features)[0]
    return {'Human': human_prob, 'AI': ai_prob}, word_perplexities
# Default example shown in the input textbox.
sample_text = """
The Saturn V is a type of rocket that was developed by NASA in the 1960s to support the Apollo program, which aimed to land humans on the Moon.
It remains the most powerful rocket ever built, and its five F-1 engines generated more than 7.5 million pounds of thrust at liftoff.
The Saturn V was used for all of the Apollo missions to the Moon, as well as the launch of the Skylab space station.
Despite its impressive capabilities, the Saturn V was only used for a brief period of time before being retired in 1973.
Nevertheless, it remains a landmark achievement in the history of space exploration and a symbol of human ingenuity and determination."""

# One text input -> (class probabilities, per-word perplexity highlighting).
demo = gr.Interface(
    fn=score_text,
    inputs=[gr.Textbox(label="Text to score", lines=5, value=sample_text)],
    outputs=[gr.Label(), gr.HighlightedText()],
)
demo.launch()