import re

import gradio as gr
import numpy as np
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Constants for input validation and scoring thresholds
MAX_TEXT_LENGTH = 2048   # maximum input length in characters (approximate; the model's hard limit is in tokens)
MIN_PERPLEXITY = 1       # minimum acceptable average perplexity for generated text
MAX_PERPLEXITY = 2048    # maximum acceptable average perplexity for generated text
MIN_AI_PERCENTAGE = 50   # default minimum threshold for AI-generated content (%)

# Download and load the spaCy model for English
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")


def detect_ai_content(text, max_gen_length, temperature, model_name, model_size,
                      num_return_sequences, min_ai_percentage):
    """Detect AI-generated content and compute the perplexity score, burstiness
    score, and average AI percentage for a given text input."""
    # Collapse extra spaces and line breaks first; keep punctuation for now so
    # that sentence boundaries survive the sentence-count check below
    cleaned_text = re.sub(r'\s+', ' ', text).strip()

    # If the text is empty or contains fewer than two sentences, return an error
    doc = nlp(cleaned_text)
    if not re.search(r'\S', cleaned_text) or len(list(doc.sents)) < 2:
        return {"error": "Input text must contain at least two sentences."}

    # Now strip special characters for the generation prompt
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)

    # Check that the text is not longer than the maximum allowed length
    if len(cleaned_text) > MAX_TEXT_LENGTH:
        return {"error": f"Input text must be no longer than {MAX_TEXT_LENGTH} characters."}

    # Check that the minimum AI-percentage threshold is within the allowed range
    if not (0 <= min_ai_percentage <= 100):
        return {"error": "Minimum threshold for AI percentage must be between 0 and 100."}

    # Load the specified GPT model and tokenizer (e.g. "gpt2-medium")
    model_name = f"{model_name}-{model_size}"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Run the model on GPU if available, otherwise on CPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # GPT-2 has no pad token, so reuse the end-of-sequence token for padding
    eos_token_id = tokenizer.eos_token_id

    # Generate multiple continuations of the input text with the GPT model
    input_ids = tokenizer.encode(cleaned_text, add_special_tokens=True,
                                 return_tensors='pt').to(device)
    output_sequences = []
    for _ in range(num_return_sequences):
        output_sequence = model.generate(
            input_ids=input_ids,
            max_length=max_gen_length + len(input_ids[0]),
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.5,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=eos_token_id,
        )[0]
        output_sequences.append(output_sequence)

    # Decode the generated sequences and score each one
    generated_texts = []
    perplexities = []
    ai_percentages = []
    for output_sequence in output_sequences:
        # Keep only the newly generated tokens, not the prompt
        generated_text = tokenizer.decode(output_sequence.tolist()[len(input_ids[0]):],
                                          skip_special_tokens=True)

        # Skip degenerate empty generations so we don't encode an empty tensor
        if not generated_text.strip():
            continue

        # Ratio of generated-text length to input length, as a percentage
        ai_percentage = round(len(generated_text) / len(cleaned_text) * 100, 2)
        ai_percentages.append(ai_percentage)

        # Perplexity of the generated text under the same model
        generated_input_ids = tokenizer.encode(generated_text, add_special_tokens=False,
                                               return_tensors='pt').to(device)
        with torch.no_grad():
            loss = model(generated_input_ids, labels=generated_input_ids).loss.item()
        perplexity = np.exp(loss)
        perplexities.append(perplexity)
        generated_texts.append(generated_text)

    # Drop generated sequences that are identical or highly similar to the input
    # or to each other. Note: en_core_web_sm ships without word vectors, so
    # similarity is approximate; en_core_web_md gives more reliable scores.
    clean_doc = nlp(cleaned_text)
    unique_generated_texts = []
    for generated_text in generated_texts:
        gen_doc = nlp(generated_text)
        similarity = clean_doc.similarity(gen_doc)
        if similarity < 0.8:
            is_unique = True
            for unique_text in unique_generated_texts:
                unique_doc = nlp(unique_text)
                if unique_doc.similarity(gen_doc) >= 0.8:
                    is_unique = False
                    break
            if is_unique:
                unique_generated_texts.append(generated_text)

    # Burstiness score: the squared type-token ratio of the input text,
    # a rough measure of vocabulary diversity
    all_tokens = []
    for sent in doc.sents:
        tokens = [token.text.lower() for token in sent
                  if not token.is_punct and not token.is_stop]
        all_tokens += tokens
    num_unique_tokens = len(set(all_tokens))
    total_tokens = len(all_tokens)
    burstiness_score = ((num_unique_tokens * num_unique_tokens) / (total_tokens * total_tokens)
                        if total_tokens else 0.0)

    # Average perplexity and AI percentage across the generated texts
    avg_perplexity = np.mean(perplexities)
    avg_ai_percentage = round(np.mean(ai_percentages), 2)

    # Check the averages against their respective thresholds
    if avg_ai_percentage < min_ai_percentage:
        return {"error": f"The generated text has an AI percentage of {avg_ai_percentage}%, "
                         f"which is below the minimum threshold of {min_ai_percentage}%."}
    if avg_perplexity < MIN_PERPLEXITY:
        return {"error": f"The generated text has a perplexity score of {avg_perplexity}, "
                         f"which is below the minimum threshold of {MIN_PERPLEXITY}."}
    if avg_perplexity > MAX_PERPLEXITY:
        return {"error": f"The generated text has a perplexity score of {avg_perplexity}, "
                         f"which is above the maximum threshold of {MAX_PERPLEXITY}."}

    # Return the unique generated texts plus the aggregate scores
    return {
        "generated_texts": unique_generated_texts,
        "burstiness_score": round(burstiness_score, 2),
        "avg_ai_percentage": avg_ai_percentage,
        "avg_perplexity": round(avg_perplexity, 2),
    }


def bai_chat(text, max_gen_length=256, temperature=0.7, model_name="gpt2",
             model_size="medium", num_return_sequences=5,
             min_ai_percentage=MIN_AI_PERCENTAGE):
    """Gradio wrapper around detect_ai_content that formats the result as text."""
    result = detect_ai_content(text, max_gen_length, temperature, model_name,
                               model_size, num_return_sequences, min_ai_percentage)
    if "error" in result:
        return result["error"]
    generated_texts = "\n\n".join(result["generated_texts"])
    return (f"Burstiness Score: {result['burstiness_score']}\n"
            f"Average AI Percentage: {result['avg_ai_percentage']}%\n"
            f"Average Perplexity Score: {result['avg_perplexity']}\n\n"
            f"{generated_texts}")


gr.Interface(
    fn=bai_chat,
    inputs=gr.Textbox(label="Input Text", placeholder="Enter your text here..."),
    outputs=gr.Textbox(label="Generated Texts"),
).launch()
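
# A minimal sketch of calling the detector directly, without the Gradio UI.
# It is left commented out because launch() above blocks the process. The
# sample text and parameter values are illustrative assumptions, not part of
# the original app; min_ai_percentage=0 disables the AI-percentage rejection
# so the raw scores are always returned.
#
# if __name__ == "__main__":
#     sample = ("The quick brown fox jumps over the lazy dog. "
#               "Meanwhile, the dog dreams of chasing rabbits across the field.")
#     result = detect_ai_content(sample, max_gen_length=64, temperature=0.7,
#                                model_name="gpt2", model_size="medium",
#                                num_return_sequences=2, min_ai_percentage=0)
#     print(result)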