# detect-ai-text / app.py
import torch
import joblib
import numpy as np
import pandas as pd
import gradio as gr
from nltk.data import load as nltk_load
from transformers import AutoTokenizer, AutoModelForCausalLM
print("Loading model & Tokenizer...")
model_id = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
print("Loading NLTL & and scikit-learn model...")
NLTK = nltk_load('data/english.pickle')
sent_cut_en = NLTK.tokenize
clf = joblib.load(f'data/gpt2-small-model')
CROSS_ENTROPY = torch.nn.CrossEntropyLoss(reduction='none')
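
# The pickled classifier loaded above is described in the UI below as a
# scikit-learn VotingClassifier (XGBClassifier, LGBMClassifier, CatBoostClassifier
# and RandomForestClassifier, all with default parameters). It consumes the
# 11 features produced by gpt2_features() further down: 7 perplexity statistics
# followed by 4 GLTR rank-bucket counts.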
example = """\
The perplexity (PPL) is commonly used as a metric for evaluating the performance of language models (LM). It is defined as the \
exponential of the negative average log-likelihood of the text under the LM. A lower PPL indicates that the language model is more confident \
in its predictions, and is therefore considered to be a better model. The training of LMs is carried out on large-scale text corpora, it can \
be considered that it has learned some common language patterns and text structures. Therefore, PPL can be used to measure how well a text \
conforms to common characteristics.
I used all variants of the open-source GPT-2 model except xl size to compute the PPL (both text-level and sentence-level PPLs) of the collected \
texts. It is observed that, regardless of whether it is at the text level or the sentence level, the content generated by LLMs have relatively \
lower PPLs compared to the text written by humans. LLM captured common patterns and structures in the text it was trained on, and is very good at \
reproducing them. As a result, text generated by LLMs have relatively concentrated low PPLs.\
"""

def gpt2_features(text, tokenizer, model, sent_cut):
    """Compute GPT-2 based perplexity and GLTR rank features for `text`."""
    # Tokenize sentence by sentence, truncating to the model's context window
    input_max_length = tokenizer.model_max_length - 2
    token_ids, offsets = list(), list()
    sentences = sent_cut(text)
    for s in sentences:
        tokens = tokenizer.tokenize(s)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        difference = len(token_ids) + len(ids) - input_max_length
        if difference > 0:
            ids = ids[:-difference]
        offsets.append((len(token_ids), len(token_ids) + len(ids)))
        token_ids.extend(ids)
        if difference >= 0:
            break

    input_ids = torch.tensor([tokenizer.bos_token_id] + token_ids)
    logits = model(input_ids).logits
    # Shift so that tokens < n predict token n
    shift_logits = logits[:-1].contiguous()
    shift_target = input_ids[1:].contiguous()
    loss = CROSS_ENTROPY(shift_logits, shift_target)

    # GLTR Test-2: count how many target tokens fall in the Top-10, Top-100,
    # Top-1000 and 1000+ ranks of the predicted probability distribution
    all_probs = torch.softmax(shift_logits, dim=-1)
    sorted_ids = torch.argsort(all_probs, dim=-1, descending=True)  # stable=True
    expanded_tokens = shift_target.unsqueeze(-1).expand_as(sorted_ids)
    indices = torch.where(sorted_ids == expanded_tokens)
    rank = indices[-1]
    counter = [
        rank < 10,
        (rank >= 10) & (rank < 100),
        (rank >= 100) & (rank < 1000),
        rank >= 1000
    ]
    counter = [c.long().sum(-1).item() for c in counter]

    # Compute text-, sentence- and step-level perplexities
    text_ppl = loss.mean().exp().item()
    sent_ppl = list()
    for start, end in offsets:
        nll = loss[start: end].sum() / (end - start)
        sent_ppl.append(nll.exp().item())
    max_sent_ppl = max(sent_ppl)
    sent_ppl_avg = sum(sent_ppl) / len(sent_ppl)
    if len(sent_ppl) > 1:
        sent_ppl_std = torch.std(torch.tensor(sent_ppl)).item()
    else:
        sent_ppl_std = 0

    mask = torch.tensor([1] * loss.size(0))
    step_ppl = loss.cumsum(dim=-1).div(mask.cumsum(dim=-1)).exp()
    max_step_ppl = step_ppl.max(dim=-1)[0].item()
    step_ppl_avg = step_ppl.sum(dim=-1).div(loss.size(0)).item()
    if step_ppl.size(0) > 1:
        step_ppl_std = step_ppl.std().item()
    else:
        step_ppl_std = 0

    ppls = [
        text_ppl, max_sent_ppl, sent_ppl_avg, sent_ppl_std,
        max_step_ppl, step_ppl_avg, step_ppl_std
    ]
    return ppls + counter  # type: ignore
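
# The 11 features above are returned in a fixed order: [text_ppl, max_sent_ppl,
# sent_ppl_avg, sent_ppl_std, max_step_ppl, step_ppl_avg, step_ppl_std] followed
# by the four GLTR rank counts (top-10, 10-100, 100-1000, 1000+), which presumably
# matches the feature order the scikit-learn classifier was trained on.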

def predict_out(features, classifier, id_to_label):
    x = np.asarray([features])
    pred = classifier.predict(x)[0]
    prob = classifier.predict_proba(x)[0, pred]
    return [id_to_label[pred], prob]

def predict(text):
    with torch.no_grad():
        feats = gpt2_features(text, tokenizer, model, sent_cut_en)
        out = predict_out(feats, clf, ['Human Written', 'LLM Generated'])
    return out
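
# Quick local sanity check (kept commented out so nothing extra runs when the
# app starts):
#     print(predict(example))  # -> [predicted label, predicted probability]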

with gr.Blocks() as demo:
    gr.Markdown(
        """\
## Detect text generated using LLMs 🤖

Linguistic features such as perplexity and other SOTA methods such as GLTR were used to classify texts as Human Written or \
LLM Generated. This solution achieved an ROC score of 0.956 and an 8th-place finish in the DAIGT LLM Competition on Kaggle.

- Source & Credits: [https://github.com/Hello-SimpleAI/chatgpt-comparison-detection](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)
- Competition: [https://www.kaggle.com/competitions/llm-detect-ai-generated-text/leaderboard](https://www.kaggle.com/competitions/llm-detect-ai-generated-text/leaderboard)
- Solution WriteUp: [https://www.kaggle.com/competitions/llm-detect-ai-generated-text/discussion/470224](https://www.kaggle.com/competitions/llm-detect-ai-generated-text/discussion/470224)\
"""
    )
    with gr.Row():
        gr.Markdown(
            """\
### Linguistic Analysis: Language Model Perplexity

The perplexity (PPL) is commonly used as a metric for evaluating the performance of language models (LMs). It is defined as the exponential \
of the negative average log-likelihood of the text under the LM. A lower PPL indicates that the language model is more confident in its \
predictions, and is therefore considered to be a better model. Since LMs are trained on large-scale text corpora, they can be considered \
to have learned common language patterns and text structures. Therefore, PPL can be used to measure how well a text conforms to these \
common characteristics.

I used all variants of the open-source GPT-2 model except the xl size to compute the PPL (both text-level and sentence-level) of the \
collected texts. It is observed that, at both the text level and the sentence level, content generated by LLMs has relatively \
lower PPL than text written by humans. LLMs capture the common patterns and structures of the text they were trained on, \
and are very good at reproducing them. As a result, text generated by LLMs has relatively concentrated, low PPLs.

Humans have the ability to express themselves in a wide variety of ways, depending on the context, audience, and purpose of the text they are \
writing. This can include using creative or imaginative elements, such as metaphors, similes, and unique word choices, which can make their \
text more difficult for GPT-2 to predict.

### GLTR: Giant Language Model Test Room

This idea originates from the following paper: arxiv.org/pdf/1906.04043.pdf. It studies three tests to compute features of an input text. The \
major assumption is that, to generate fluent and natural-looking text, most decoding strategies sample high-probability tokens from the head \
of the distribution. I selected the most powerful Test-2 feature, which is the number of tokens in the Top-10, Top-100, Top-1000, and 1000+ \
ranks of the LM's predicted probability distributions.

### Modelling

The classifier is scikit-learn's VotingClassifier, consisting of an XGBClassifier, LGBMClassifier, CatBoostClassifier, and RandomForestClassifier, all with default parameters.\
"""
        )
        with gr.Column():
            a1 = gr.Textbox(lines=7, label='Text', value=example)
            button1 = gr.Button("🤖 Predict!")
            gr.Markdown("Prediction:")
            label1 = gr.Textbox(lines=1, label='Predicted Label')
            score1 = gr.Textbox(lines=1, label='Predicted Probability')

    button1.click(predict, inputs=[a1], outputs=[label1, score1])

demo.launch()