import gradio as gr
from transformers import pipeline, AutoTokenizer

# --- MODEL LOADING ---
# Load both the pipeline and the tokenizer for the model.
# The tokenizer is needed to split the text into chunks the model can understand.
model_name = "openai-community/roberta-base-openai-detector"
pipe = pipeline("text-classification", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token overlap between consecutive chunks, to preserve context across cuts.
CHUNK_OVERLAP = 50


def detect_ai_text(text):
    """Classify text as AI- or human-written, chunking long inputs.

    Short inputs are classified in a single pipeline call. Inputs longer
    than the model's context window are split into overlapping token
    chunks; per-label scores are averaged across chunks.

    Args:
        text: The text to analyze.

    Returns:
        dict mapping each model label to its score. For chunked inputs an
        extra 'note' key reports how many chunks were aggregated. On
        failure, a dict with a single 'error' key.
    """
    # Guard against empty/whitespace-only input before touching the model.
    if not text or not text.strip():
        return {"error": "Could not process text."}

    # Reserve a couple of tokens for the special tokens the pipeline adds.
    max_length = tokenizer.model_max_length - 2

    # Encode WITHOUT special tokens: the pipeline re-adds them per chunk.
    # (Encoding with them would leave stray <s>/</s> text inside decoded
    # chunks, and the pipeline would add a second set on top.)
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Short input: one pipeline call suffices. truncation=True is a safety
    # net in case re-tokenization drifts past the model limit.
    if len(tokens) <= max_length:
        results = pipe(text, truncation=True)
        return {item['label']: item['score'] for item in results}

    # --- CHUNKING LOGIC FOR LONG TEXT ---
    # Aggregate scores per label instead of hard-coding a label name: this
    # model reports human-readable labels ("Real"/"Fake"), so matching on
    # 'LABEL_1' would silently collect nothing and always fall through to
    # the error path for long texts.
    label_scores = {}
    step = max_length - CHUNK_OVERLAP
    for start in range(0, len(tokens), step):
        # Decode the chunk tokens back to a string for the pipeline.
        chunk_text = tokenizer.decode(
            tokens[start:start + max_length], skip_special_tokens=True
        )
        for item in pipe(chunk_text, truncation=True):
            label_scores.setdefault(item['label'], []).append(item['score'])

    # If for some reason no scores were collected, return an error state.
    if not label_scores:
        return {"error": "Could not process text."}

    # Average each label's scores across chunks; report the chunk count.
    n_chunks = max(len(scores) for scores in label_scores.values())
    result = {
        label: sum(scores) / len(scores)
        for label, scores in label_scores.items()
    }
    result['note'] = f'Result aggregated from {n_chunks} chunks.'
    return result


# --- GRADIO INTERFACE ---
iface = gr.Interface(
    fn=detect_ai_text,
    inputs=gr.Textbox(
        lines=15, placeholder="Paste the text you want to analyze here..."
    ),
    outputs="json",
    title="AI Content Detector (Robust Version)",
    description=(
        "This version handles long texts by breaking them into chunks. "
        "It analyzes text for AI generation using the "
        "roberta-base-openai-detector model."
    ),
)

# Launch only when run as a script (e.g. `python app.py`), not on import.
if __name__ == "__main__":
    iface.launch()