import pickle
import torch
import numpy as np
import gradio as gr
from nltk import word_tokenize, sent_tokenize
import nltk
from scipy.stats import shapiro
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

nltk.download('punkt')

# GPT-2 Large supplies the per-token losses used as perplexity features.
model = GPT2LMHeadModel.from_pretrained('gpt2-large')
tokenizer: GPT2TokenizerFast = GPT2TokenizerFast.from_pretrained('gpt2-large')

# Pre-trained classifier (loaded from model.pkl) that maps the features to [Human, AI] probabilities.
with open('model.pkl', 'rb') as f:
    lr_model = pickle.load(f)

def get_perplexity(text: str):
    """Return the mean token loss (log-perplexity) and per-word scores for highlighting."""
    tokens = tokenizer(text, return_tensors='pt', truncation=True, return_offsets_mapping=True)
    inputs = tokens.input_ids
    targets = inputs.clone()
    with torch.no_grad():
        outputs = model(inputs, labels=targets)
        labels = targets.to(outputs.logits.device)
        # Shift so that tokens < n predict n
        shift_logits = outputs.logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # Per-token cross-entropy; reduction='none' keeps one loss per predicted token
        # (replaces the deprecated reduce=False argument).
        perplexities = torch.nn.functional.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
            reduction='none')
        # Normalise to [0, 1] so the values can drive the highlight intensity.
        perplexities = perplexities.cpu().numpy()
        perplexities = (perplexities / np.max(perplexities)).tolist()
        # Map token-level scores back onto whole words for gr.HighlightedText.
        # The first word has no preceding context to predict it from, so score it 0.
        output = [(text[:tokens.word_to_chars(0)[1]], 0)]
        for word_id, p in zip(tokens.word_ids()[1:], perplexities):
            if word_id == len(output):  # first token of a word not yet added
                span = tokens.word_to_chars(word_id)
                output.append((text[span[0]:span[1]], p))
        # outputs.loss is the mean token cross-entropy (the log of the perplexity).
        return outputs.loss, output



def score_text(text):
    """Score a text as human- or AI-written and return per-word highlight data."""
    perplexity, word_perplexities = get_perplexity(text)
    # Sentence-length statistics serve as additional classifier features.
    lengths = [len(word_tokenize(sentence)) for sentence in sent_tokenize(text)]
    # The Shapiro-Wilk normality test needs at least 3 samples; otherwise use a neutral p-value.
    normality = shapiro(lengths).pvalue if len(lengths) > 2 else 0.5
    features = [[perplexity.item(), np.mean(lengths), np.std(lengths), normality]]
    scores = lr_model.predict_proba(features)[0]

    return {'Human': scores[0], 'AI': scores[1]}, word_perplexities

sample_text = """
The Saturn V is a type of rocket that was developed by NASA in the 1960s to support the Apollo program, which aimed to land humans on the Moon. 
It remains the most powerful rocket ever built, and its five F-1 engines generated more than 7.5 million pounds of thrust at liftoff. 
The Saturn V was used for all of the Apollo missions to the Moon, as well as the launch of the Skylab space station. 
Despite its impressive capabilities, the Saturn V was only used for a brief period of time before being retired in 1973. 
Nevertheless, it remains a landmark achievement in the history of space exploration and a symbol of human ingenuity and determination."""

demo = gr.Interface(
    fn=score_text,
    inputs=[gr.Textbox(label="Text to score", lines=5, value=sample_text)],
    outputs=[gr.Label(), gr.HighlightedText()],
)

demo.launch()