Spaces:
Runtime error
Runtime error
File size: 3,241 Bytes
cc5ee73 ba02478 cc5ee73 ba02478 884c12c cc5ee73 884c12c cc5ee73 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import pickle
import torch
import numpy as np
import gradio as gr
from nltk import word_tokenize, sent_tokenize
import nltk
from scipy.stats import shapiro
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
# Tokenizer data used by nltk.sent_tokenize / nltk.word_tokenize.
# 'punkt' is the resource older NLTK releases load; newer releases
# (3.9 and later) look up 'punkt_tab' instead, so fetch both.
# nltk.download reports a failure by returning False rather than raising,
# so requesting a resource a given install does not need is harmless.
nltk.download('punkt')
nltk.download('punkt_tab')

# GPT-2 (large) language model and its matching fast tokenizer; the model
# supplies the loss and per-token scores used by get_perplexity.
model = GPT2LMHeadModel.from_pretrained('gpt2-large')
tokenizer: GPT2TokenizerFast = GPT2TokenizerFast.from_pretrained('gpt2-large')

# Classifier consumed by score_text through predict_proba.
# NOTE(security): pickle.load can execute arbitrary code while
# deserialising. Only ship a 'model.pkl' that was produced by a trusted
# source; never load one supplied by a user.
with open('model.pkl', 'rb') as f:
    lr_model = pickle.load(f)
def get_perplexity(text: str):
    """Score *text* with the GPT-2 model and build per-word highlights.

    The text is tokenized (with ``truncation=True``) and run through the
    module-level ``model`` with the input ids as labels. A per-token
    cross-entropy is then computed on the shifted logits/labels, scaled by
    the largest per-token value, and mapped back onto the words of *text*.

    Args:
        text: The passage to score.

    Returns:
        A ``(loss, words)`` tuple. ``loss`` is ``outputs.loss`` exactly as
        returned by the model (a tensor; it is not exponentiated here).
        ``words`` is a list of ``(substring, score)`` pairs: the first word
        always has score ``0``, and every later word carries the scaled
        loss of its first token.

    Raises:
        ValueError: If *text* tokenizes to zero tokens.
    """
    tokens = tokenizer(text, return_tensors='pt', truncation=True)
    inputs = tokens.input_ids
    if inputs.numel() == 0:
        raise ValueError("text must contain at least one token to be scored")

    targets = inputs.clone()
    with torch.no_grad():
        outputs = model(inputs, labels=targets)

    labels = targets.to(outputs.logits.device)
    # Shift so that tokens < n predict n
    shift_logits = outputs.logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    # reduction='none' keeps one loss value per predicted token.
    token_losses = torch.nn.functional.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
        reduction='none',
    )

    # Scale by the largest loss. A single-token input yields no predicted
    # tokens (empty array), and an all-zero array cannot be divided by its
    # maximum, so both cases skip the division.
    losses = token_losses.to('cpu').numpy()
    peak = np.max(losses) if losses.size else 0.0
    if peak > 0:
        losses = losses / peak
    scaled = losses.tolist()

    # The first word has no preceding context to be predicted from, so it
    # is emitted with a score of 0.
    output = [(text[:tokens.word_to_chars(0)[1]], 0)]
    # scaled[i] belongs to token i + 1, hence word_ids()[1:]. A word is
    # appended only when its id equals the number of words emitted so far,
    # i.e. on the first token of the next word in sequence.
    for word_id, p in zip(tokens.word_ids()[1:], scaled):
        if word_id == len(output):
            span = tokens.word_to_chars(word_id)
            output.append((text[span[0]:span[1]], p))
    return outputs.loss, output
def score_text(text):
    """Classify *text* as human- or AI-written and return word highlights.

    Builds a four-value feature row -- the loss from ``get_perplexity``,
    the mean and standard deviation of per-sentence token counts, and a
    Shapiro-Wilk p-value over those counts (``0.5`` when there are fewer
    than three sentences) -- and feeds it to ``lr_model.predict_proba``.

    Returns:
        A tuple of ``{'Human': p0, 'AI': p1}`` and the per-word score list
        produced by ``get_perplexity``.
    """
    loss, highlighted_words = get_perplexity(text)

    # Token count of every sentence NLTK finds in the text.
    sentence_lengths = [
        len(word_tokenize(sentence)) for sentence in sent_tokenize(text)
    ]

    loss_value = loss.item()
    mean_length = np.mean(sentence_lengths)
    length_spread = np.std(sentence_lengths)
    if len(sentence_lengths) > 2:
        normality_p = shapiro(sentence_lengths).pvalue
    else:
        # Too few sentences for the test; use a fixed placeholder value.
        normality_p = 0.5

    feature_row = [loss_value, mean_length, length_spread, normality_p]
    probabilities = lr_model.predict_proba([feature_row])[0]
    label_scores = {'Human': probabilities[0], 'AI': probabilities[1]}
    return label_scores, highlighted_words
# Example passage used as the initial value of the demo's text box.
# Built from adjacent literals; the value starts with a newline and has no
# trailing newline.
sample_text = (
    "\n"
    "The Saturn V is a type of rocket that was developed by NASA in the 1960s to support the Apollo program, which aimed to land humans on the Moon.\n"
    "It remains the most powerful rocket ever built, and its five F-1 engines generated more than 7.5 million pounds of thrust at liftoff.\n"
    "The Saturn V was used for all of the Apollo missions to the Moon, as well as the launch of the Skylab space station.\n"
    "Despite its impressive capabilities, the Saturn V was only used for a brief period of time before being retired in 1973.\n"
    "Nevertheless, it remains a landmark achievement in the history of space exploration and a symbol of human ingenuity and determination."
)
# Gradio UI: one text box (pre-filled with sample_text) feeding score_text,
# whose two return values are shown as a label and as highlighted text.
demo = gr.Interface(
    fn=score_text,
    inputs=[gr.Textbox(label="Text to score", lines=5, value=sample_text)],
    outputs=[gr.Label(), gr.HighlightedText()],
)
demo.launch()