# Spaces: Neha13 / AI_Content_Detector
# Sleeping
# File size: 3,274 Bytes
# 285d2df

import streamlit as st
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import nltk
from nltk.util import ngrams
from nltk.probability import FreqDist
import plotly.express as px
import torch.nn.functional as F
from collections import Counter
from nltk.corpus import stopwords
import string

import nltk
# Fetch the NLTK data needed by nltk.word_tokenize and the stopword list.
# 'punkt_tab' is the tokenizer data newer NLTK releases look up for
# word_tokenize; 'punkt' is kept so older releases keep working. A resource
# unknown to the installed NLTK is reported by nltk.download and skipped.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

def c_perplexity(text):
    """Calculate a GPT-2 based perplexity score for the given text.

    The text is encoded with the module-level ``tokenizer``, passed through
    the module-level ``model``, and the exponential of the mean cross-entropy
    between the resulting logits and the input token ids is returned.

    Args:
        text: The text to score.

    Returns:
        The score as a Python float. ``float('inf')`` is returned when the
        text is empty or whitespace-only, or when it encodes to zero tokens.
    """
    if not text.strip():
        return float('inf')  # Return inf for empty input

    input_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors='pt')
    if input_ids.size(1) == 0:  # Check for empty input after encoding
        return float('inf')

    # GPT-2 only has a fixed number of position embeddings; a longer sequence
    # fails inside the model. Score the leading window instead of crashing.
    max_positions = getattr(model.config, 'n_positions', None)
    if max_positions and input_ids.size(1) > max_positions:
        input_ids = input_ids[:, :max_positions]

    with torch.no_grad():
        logits = model(input_ids).logits

    # NOTE(review): the logits at position i are scored against the token at
    # the same position i (no one-step shift). A causal LM is normally scored
    # against the *next* token, so this is presumably not standard perplexity.
    # Left unchanged because the caller compares the result with a fixed
    # threshold tuned to this scale -- TODO confirm before changing.
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), input_ids.view(-1))
    return torch.exp(loss).item()

def c_burstiness(text):
    """Return the share of distinct tokens that occur more than once.

    The text is lower-cased and split with ``nltk.word_tokenize``; the result
    is the number of distinct tokens seen at least twice divided by the
    number of distinct tokens. Text that yields no tokens scores ``0.0``.
    """
    words = nltk.word_tokenize(text.lower())
    if len(words) == 0:
        return 0.0

    frequencies = FreqDist(words)
    distinct_total = len(frequencies)
    if distinct_total <= 0:
        return 0.0

    # Count the distinct tokens whose frequency exceeds one.
    repeated_total = sum(1 for occurrences in frequencies.values() if occurrences > 1)
    return repeated_total / distinct_total

def top_repword_count(text):
    """Render a bar chart of the 10 most frequent words in the text.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``.
    English stopwords and tokens made up solely of punctuation characters
    are discarded before counting.

    Args:
        text: The text to analyse.

    Returns:
        None. The chart is written to the Streamlit page, or the message
        "No significant words found." when nothing remains after filtering.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    def _is_punctuation(token):
        # Check every character: a plain `token in string.punctuation` is a
        # substring test and lets multi-character tokens such as "..." through.
        return all(char in punctuation_chars for char in token)

    tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token not in stop_words and not _is_punctuation(token)
    ]

    top_words = Counter(tokens).most_common(10)
    if not top_words:
        st.write("No significant words found.")
        return

    words, counts = zip(*top_words)
    fig = px.bar(x=words, y=counts, labels={'x': 'Words', 'y': 'Counts'}, title="Top 10 Most Repeated Words in the Text")
    # The keyword is `use_container_width`; the original `user_container_width`
    # was a typo and did not stretch the chart to the column width.
    st.plotly_chart(fig, use_container_width=True)

# Streamlit app configuration
st.set_page_config(layout="wide")
st.title("AI Content Detector")

text_area = st.text_area("Enter your text here!")

# Short-circuit: the button is only rendered once some text has been entered,
# and the analysis only runs on the rerun triggered by clicking it.
if text_area and st.button("Analyse the content"):
    input_col, score_col, review_col = st.columns([1, 2, 1])

    # Left column: echo the submitted text.
    with input_col:
        st.info("Your input text")
        st.success(text_area)

    # Middle column: the two scores and the verdict derived from them.
    with score_col:
        st.info("Your output score")
        ppl_score = c_perplexity(text_area)
        burst_score = c_burstiness(text_area)

        st.success(f"Perplexity score: {ppl_score}")
        st.success(f"Burstiness score: {burst_score}")

        # Either condition alone flags the text.
        flagged_as_ai = ppl_score > 40000 or burst_score < 0.24
        if flagged_as_ai:
            st.error("Result: The text is likely AI-generated.")
        else:
            st.success("Result: The text is not AI-generated.")

        st.warning("Disclaimer: AI plagiarism detector apps can assist in identifying potential instances of plagiarism.")

    # Right column: most frequent words chart.
    with review_col:
        st.info("Basic Review")
        top_repword_count(text_area)