"""Gradio app that estimates how likely a text was generated by an LLM.

Combines a GPT-2 (large) perplexity signal with simple stylometric
features (sentence-length statistics) fed into a pre-trained
logistic-regression classifier loaded from ``model.pkl``.  ``data.pkl``
holds the reference feature matrix used to turn raw feature values into
percentiles for display.
"""

import pickle

import gradio as gr
import nltk
import numpy as np
import torch
from nltk import sent_tokenize, word_tokenize
from scipy.stats import percentileofscore, shapiro
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

nltk.download('punkt')

model = GPT2LMHeadModel.from_pretrained('gpt2-large')
tokenizer: GPT2TokenizerFast = GPT2TokenizerFast.from_pretrained('gpt2-large')

# NOTE(security): pickle.load executes arbitrary code on load — only ship
# model.pkl / data.pkl files from a trusted source with this app.
with open('model.pkl', 'rb') as f:
    lr_model = pickle.load(f)  # sklearn-style classifier with predict_proba
with open('data.pkl', 'rb') as f:
    data = pickle.load(f)  # reference feature matrix, columns: pp, len, std, norm


def get_perplexity(text: str):
    """Compute GPT-2 loss and per-word relative perplexities for *text*.

    Returns a tuple ``(loss, word_scores)`` where ``loss`` is the mean
    language-model cross-entropy (a 0-dim tensor) and ``word_scores`` is a
    list of ``(word_text, score)`` pairs with scores normalised to [0, 1],
    in the shape expected by ``gr.HighlightedText``.
    """
    tokens = tokenizer(text, return_tensors='pt', truncation=True,
                       return_offsets_mapping=True)
    inputs = tokens.input_ids
    targets = inputs.clone()
    with torch.no_grad():
        outputs = model(inputs, labels=targets)

    labels = targets.to(outputs.logits.device)
    # Shift so that tokens < n predict token n.
    shift_logits = outputs.logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    # Per-token loss; reduction='none' replaces the deprecated reduce=False,
    # which has been removed from current PyTorch releases.
    token_losses = torch.nn.functional.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
        reduction='none')

    token_losses = token_losses.to('cpu').numpy()
    # Normalise to [0, 1] for highlighting; guard the (degenerate) case of
    # an all-zero loss vector to avoid a division-by-zero NaN.
    max_loss = float(np.max(token_losses))
    if max_loss > 0:
        token_losses = token_losses / max_loss
    token_losses = token_losses.tolist()

    # The first word has no preceding context to be predicted from, so it
    # gets a fixed score of 0.
    word_scores = [(text[:tokens.word_to_chars(0)[1]], 0)]
    # token_losses[i] is the loss of token i+1 (the shift above), and
    # word_ids()[1:] lines up with it.  Only the first token of each word
    # contributes; later sub-word tokens of the same word are skipped.
    for word_id, score in zip(tokens.word_ids()[1:], token_losses):
        if word_id == len(word_scores):
            span = tokens.word_to_chars(word_id)
            word_scores.append((text[span[0]:span[1]], score))
    return outputs.loss, word_scores


def score_text(text):
    """Score *text* for the UI.

    Returns ``(class_probabilities, feature_percentiles, word_perplexities)``
    matching the three Gradio output components.
    """
    perplexity, word_perplexities = get_perplexity(text)

    # Words per sentence — the stylometric side of the feature vector.
    lengths = [len(word_tokenize(sentence)) for sentence in sent_tokenize(text)]

    pp = perplexity.item()
    length = np.mean(lengths)
    std_lengths = np.std(lengths)
    # Shapiro-Wilk normality test needs at least 3 samples; fall back to a
    # neutral p-value for very short texts.
    predictability = shapiro(lengths).pvalue if len(lengths) > 2 else 0.5

    scores = lr_model.predict_proba(
        [[pp, length, std_lengths, predictability]])[0]

    # Percentile of each feature against the reference corpus, for display.
    pp_percentile = percentileofscore(data[:, 0], pp)
    length_percentile = percentileofscore(data[:, 1], length)
    std_percentile = percentileofscore(data[:, 2], std_lengths)
    predictability_percentile = percentileofscore(data[:, 3], predictability)
    print(f'Perplexity: {pp_percentile}%, Length: {length_percentile}%, Std: {std_percentile}%, Predictability: {predictability_percentile}%')

    return {'Human': scores[0], 'AI': scores[1]}, {
        'Perplexity': pp_percentile / 100,
        'Sentence Length': length_percentile / 100,
        'Length Variation': std_percentile / 100,
        'Length Normality': predictability_percentile / 100,
    }, word_perplexities


# Example 1: AI-generated encyclopedic text.
sample_1 = """The Saturn V is a type of rocket that was developed by NASA in the 1960s to support the Apollo program, which aimed to land humans on the Moon. It remains the most powerful rocket ever built, and its five F-1 engines generated more than 7.5 million pounds of thrust at liftoff. The Saturn V was used for all of the Apollo missions to the Moon, as well as the launch of the Skylab space station. Despite its impressive capabilities, the Saturn V was only used for a brief period of time before being retired in 1973. Nevertheless, it remains a landmark achievement in the history of space exploration and a symbol of human ingenuity and determination."""

# Example 2: human-written encyclopedic text on the same subject.
sample_2 = """Saturn V[a] is a retired American super heavy-lift launch vehicle developed by NASA under the Apollo program for human exploration of the Moon. The rocket was human-rated, with three stages, and powered with liquid fuel. It was flown from 1967 to 1973. It was used for nine crewed flights to the Moon, and to launch Skylab, the first American space station. As of 2023, the Saturn V remains the only launch vehicle to carry humans beyond low Earth orbit (LEO). Saturn V holds records for the heaviest payload launched and largest payload capacity to low Earth orbit: 310,000 lb (140,000 kg), which included the third stage and unburned propellant needed to send the Apollo command and service module and Lunar Module to the Moon.
The largest production model of the Saturn family of rockets, the Saturn V was designed under the direction of Wernher von Braun at the Marshall Space Flight Center in Huntsville, Alabama; the lead contractors were Boeing, North American Aviation, Douglas Aircraft Company, and IBM. A total of 15 flight-capable vehicles were built, plus three for ground testing. Thirteen were launched from Kennedy Space Center with no loss of crew or payload. A total of 24 astronauts were launched to the Moon from Apollo 8 (December 1968) to Apollo 17 (December 1972)."""

# Example 3: human-written fiction (E. M. Forster, "A Room with a View").
sample_3 = """“The Signora had no business to do it,” said Miss Bartlett, “no business at all. She promised us south rooms with a view close together, instead of which here are north rooms, looking into a courtyard, and a long way apart. Oh, Lucy!” “And a Cockney, besides!” said Lucy, who had been further saddened by the Signora’s unexpected accent. “It might be London.” She looked at the two rows of English people who were sitting at the table; at the row of white bottles of water and red bottles of wine that ran between the English people; at the portraits of the late Queen and the late Poet Laureate that hung behind the English people, heavily framed; at the notice of the English church (Rev. Cuthbert Eager, M. A. Oxon.), that was the only other decoration of the wall. “Charlotte, don’t you feel, too, that we might be in London? I can hardly believe that all kinds of other things are just outside. I suppose it is one’s being so tired.” “This meat has surely been used for soup,” said Miss Bartlett, laying down her fork. “I want so to see the Arno. The rooms the Signora promised us in her letter would have looked over the Arno. The Signora had no business to do it at all. Oh, it is a shame!” “Any nook does for me,” Miss Bartlett continued; “but it does seem hard that you shouldn’t have a view.” Lucy felt that she had been selfish. “Charlotte, you mustn’t spoil me: of course, you must look over the Arno, too.
I meant that. The first vacant room in the front—” “You must have it,” said Miss Bartlett, part of whose travelling expenses were paid by Lucy’s mother—a piece of generosity to which she made many a tactful allusion."""

# Example 4: AI-generated fiction continuing the same scene.
sample_4 = """Miss Bartlett looked at Lucy with a mixture of disapproval and concern. She had hoped that this trip to Italy would broaden Lucy’s horizons and introduce her to a world beyond their sheltered English existence. But it seemed that Lucy was not quite ready to embrace the differences that came with travel. “Don’t be absurd, Lucy,” Miss Bartlett said, “how could we be in London? Look outside and see the sunshine, the olive groves, and the mountains. This is Italy, and it is a completely different experience than what we are used to.” Lucy sighed and looked out of the window. Miss Bartlett was right, of course. The view was stunning, and the warm Italian breeze was a welcome change from the damp English weather. But the Signora’s deception had put a damper on their arrival, and Lucy couldn’t help feeling disappointed. Just then, a young man walked into the dining room and greeted the English guests with a friendly smile. He was tall and handsome, with dark hair and sparkling eyes. Lucy felt a flutter in her chest as he approached their table. “Buongiorno,” he said, “my name is George Emerson. I couldn’t help but notice that you were disappointed with your rooms. If you’d like, I could switch with you. My mother and I are in south rooms, and we’d be happy to take the north ones.”"""

description = """This Space can be used to measure the likelihood of a text being generated by an LLM like ChatGPT. In general, human written text has higher perplexity, sentence length, and length variation than AI generated text, with lower length normality.
Perplexity is a measure of how often uncommon words appear in the text."""

demo = gr.Interface(
    fn=score_text,
    inputs=[gr.Textbox(label="Text to score", lines=5)],
    outputs=[
        gr.Label(label="Result"),
        gr.Label(label="Feature Scores (higher for humans)", show_label=False),
        gr.HighlightedText(label="Perplexities"),
    ],
    title="LLM Text Detector",
    description=description,
    examples=[[sample_1], [sample_2], [sample_3], [sample_4]],
)

demo.launch()