# detect-ai-text / app.py
import torch
import joblib
import numpy as np
import pandas as pd
import gradio as gr
from nltk.data import load as nltk_load
from transformers import AutoTokenizer, AutoModelForCausalLM
print("Loading model & Tokenizer...")
model_id = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
print("Loading NLTL & and scikit-learn model...")
NLTK = nltk_load('data/english.pickle')
sent_cut_en = NLTK.tokenize
clf = joblib.load(f'data/gpt2-small-model')
CROSS_ENTROPY = torch.nn.CrossEntropyLoss(reduction='none')
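
# The pickled classifier loaded above is described in the UI below as a
# scikit-learn VotingClassifier (XGBClassifier, LGBMClassifier, CatBoostClassifier
# and RandomForestClassifier, all with default parameters). It consumes the
# 11 features produced by gpt2_features() further down: 7 perplexity statistics
# followed by 4 GLTR rank-bucket counts.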
example = """\
The perplexity (PPL) is commonly used as a metric for evaluating the performance of language models (LM). It is defined as the \
exponential of the negative average log-likelihood of the text under the LM. A lower PPL indicates that the language model is more confident \
in its predictions, and is therefore considered to be a better model. The training of LMs is carried out on large-scale text corpora, it can \
be considered that it has learned some common language patterns and text structures. Therefore, PPL can be used to measure how well a text \
conforms to common characteristics.
I used all variants of the open-source GPT-2 model except xl size to compute the PPL (both text-level and sentence-level PPLs) of the collected \
texts. It is observed that, regardless of whether it is at the text level or the sentence level, the content generated by LLMs have relatively \
lower PPLs compared to the text written by humans. LLM captured common patterns and structures in the text it was trained on, and is very good at \
reproducing them. As a result, text generated by LLMs have relatively concentrated low PPLs.\
"""

def gpt2_features(text, tokenizer, model, sent_cut):
    """Compute GPT-2 based perplexity and GLTR rank features for `text`."""
    # Tokenize sentence by sentence, truncating to the model's context window
    input_max_length = tokenizer.model_max_length - 2
    token_ids, offsets = list(), list()
    sentences = sent_cut(text)
    for s in sentences:
        tokens = tokenizer.tokenize(s)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        difference = len(token_ids) + len(ids) - input_max_length
        if difference > 0:
            ids = ids[:-difference]
        offsets.append((len(token_ids), len(token_ids) + len(ids)))
        token_ids.extend(ids)
        if difference >= 0:
            break

    input_ids = torch.tensor([tokenizer.bos_token_id] + token_ids)
    logits = model(input_ids).logits
    # Shift so that tokens < n predict token n
    shift_logits = logits[:-1].contiguous()
    shift_target = input_ids[1:].contiguous()
    loss = CROSS_ENTROPY(shift_logits, shift_target)

    # GLTR Test-2: count how many target tokens fall in the Top-10, Top-100,
    # Top-1000 and 1000+ ranks of the predicted probability distribution
    all_probs = torch.softmax(shift_logits, dim=-1)
    sorted_ids = torch.argsort(all_probs, dim=-1, descending=True)  # stable=True
    expanded_tokens = shift_target.unsqueeze(-1).expand_as(sorted_ids)
    indices = torch.where(sorted_ids == expanded_tokens)
    rank = indices[-1]
    counter = [
        rank < 10,
        (rank >= 10) & (rank < 100),
        (rank >= 100) & (rank < 1000),
        rank >= 1000
    ]
    counter = [c.long().sum(-1).item() for c in counter]

    # Compute text-, sentence- and step-level perplexities
    text_ppl = loss.mean().exp().item()
    sent_ppl = list()
    for start, end in offsets:
        nll = loss[start: end].sum() / (end - start)
        sent_ppl.append(nll.exp().item())
    max_sent_ppl = max(sent_ppl)
    sent_ppl_avg = sum(sent_ppl) / len(sent_ppl)
    if len(sent_ppl) > 1:
        sent_ppl_std = torch.std(torch.tensor(sent_ppl)).item()
    else:
        sent_ppl_std = 0

    mask = torch.tensor([1] * loss.size(0))
    step_ppl = loss.cumsum(dim=-1).div(mask.cumsum(dim=-1)).exp()
    max_step_ppl = step_ppl.max(dim=-1)[0].item()
    step_ppl_avg = step_ppl.sum(dim=-1).div(loss.size(0)).item()
    if step_ppl.size(0) > 1:
        step_ppl_std = step_ppl.std().item()
    else:
        step_ppl_std = 0

    ppls = [
        text_ppl, max_sent_ppl, sent_ppl_avg, sent_ppl_std,
        max_step_ppl, step_ppl_avg, step_ppl_std
    ]
    return ppls + counter  # type: ignore
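
# The 11 features above are returned in a fixed order: [text_ppl, max_sent_ppl,
# sent_ppl_avg, sent_ppl_std, max_step_ppl, step_ppl_avg, step_ppl_std] followed
# by the four GLTR rank counts (top-10, 10-100, 100-1000, 1000+), which presumably
# matches the feature order the scikit-learn classifier was trained on.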

def predict_out(features, classifier, id_to_label):
    x = np.asarray([features])
    pred = classifier.predict(x)[0]
    prob = classifier.predict_proba(x)[0, pred]
    return [id_to_label[pred], prob]

def predict(text):
    with torch.no_grad():
        feats = gpt2_features(text, tokenizer, model, sent_cut_en)
        out = predict_out(feats, clf, ['Human Written', 'LLM Generated'])
    return out
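
# Quick local sanity check (kept commented out so nothing extra runs when the
# app starts):
#     print(predict(example))  # -> [predicted label, predicted probability]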

with gr.Blocks() as demo:
    gr.Markdown(
        """\
## Detect text generated using LLMs 🤖

Linguistic features such as perplexity and other SOTA methods such as GLTR were used to classify texts as Human Written or \
LLM Generated. This solution achieved an ROC score of 0.956 and an 8th-place finish in the DAIGT LLM Competition on Kaggle.

- Source & Credits: [https://github.com/Hello-SimpleAI/chatgpt-comparison-detection](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)
- Competition: [https://www.kaggle.com/competitions/llm-detect-ai-generated-text/leaderboard](https://www.kaggle.com/competitions/llm-detect-ai-generated-text/leaderboard)
- Solution WriteUp: [https://www.kaggle.com/competitions/llm-detect-ai-generated-text/discussion/470224](https://www.kaggle.com/competitions/llm-detect-ai-generated-text/discussion/470224)\
"""
    )
    with gr.Row():
        gr.Markdown(
            """\
### Linguistic Analysis: Language Model Perplexity

The perplexity (PPL) is commonly used as a metric for evaluating the performance of language models (LMs). It is defined as the exponential \
of the negative average log-likelihood of the text under the LM. A lower PPL indicates that the language model is more confident in its \
predictions, and is therefore considered to be a better model. Since LMs are trained on large-scale text corpora, they can be considered \
to have learned common language patterns and text structures. Therefore, PPL can be used to measure how well a text conforms to these \
common characteristics.

I used all variants of the open-source GPT-2 model except the xl size to compute the PPL (both text-level and sentence-level) of the \
collected texts. It is observed that, at both the text level and the sentence level, content generated by LLMs has relatively \
lower PPL than text written by humans. LLMs capture the common patterns and structures of the text they were trained on, \
and are very good at reproducing them. As a result, text generated by LLMs has relatively concentrated, low PPLs.

Humans have the ability to express themselves in a wide variety of ways, depending on the context, audience, and purpose of the text they are \
writing. This can include using creative or imaginative elements, such as metaphors, similes, and unique word choices, which can make their \
text more difficult for GPT-2 to predict.

### GLTR: Giant Language Model Test Room

This idea originates from the following paper: arxiv.org/pdf/1906.04043.pdf. It studies three tests to compute features of an input text. The \
major assumption is that, to generate fluent and natural-looking text, most decoding strategies sample high-probability tokens from the head \
of the distribution. I selected the most powerful Test-2 feature, which is the number of tokens in the Top-10, Top-100, Top-1000, and 1000+ \
ranks of the LM's predicted probability distributions.

### Modelling

The classifier is scikit-learn's VotingClassifier, consisting of an XGBClassifier, LGBMClassifier, CatBoostClassifier, and RandomForestClassifier, all with default parameters.\
"""
        )
        with gr.Column():
            a1 = gr.Textbox(lines=7, label='Text', value=example)
            button1 = gr.Button("🤖 Predict!")
            gr.Markdown("Prediction:")
            label1 = gr.Textbox(lines=1, label='Predicted Label')
            score1 = gr.Textbox(lines=1, label='Predicted Probability')

    button1.click(predict, inputs=[a1], outputs=[label1, score1])

demo.launch()