File size: 3,274 Bytes
285d2df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import streamlit as st
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import nltk
from nltk.util import ngrams
from nltk.probability import FreqDist
import plotly.express as px
import torch.nn.functional as F
from collections import Counter
from nltk.corpus import stopwords
import string

import nltk
@st.cache_resource(show_spinner=False)
def _load_resources():
    """Download the NLTK data and load the GPT-2 tokenizer and model once.

    Streamlit re-executes this script on every widget interaction; caching
    the loader keeps the downloads and the GPT-2 weight loading from being
    repeated on each rerun. The spinner is disabled so that no Streamlit
    element is emitted before ``st.set_page_config`` runs later in the file.

    Returns:
        A ``(tokenizer, model)`` tuple for the pretrained ``'gpt2'`` checkpoint.
    """
    # 'punkt_tab' is requested in addition to 'punkt': newer NLTK releases
    # load the word tokenizer data from 'punkt_tab', older ones from 'punkt'.
    for resource in ('punkt', 'punkt_tab', 'stopwords'):
        nltk.download(resource)
    return (
        GPT2Tokenizer.from_pretrained('gpt2'),
        GPT2LMHeadModel.from_pretrained('gpt2'),
    )


# Initialize tokenizer and model
tokenizer, model = _load_resources()

def c_perplexity(text):
    """Score ``text`` with GPT-2 and return the exponentiated cross-entropy.

    The text is encoded without special tokens, passed through the
    module-level ``model`` with gradients disabled, and the logits are
    compared against the input token ids with ``F.cross_entropy``. The
    exponential of that mean loss is returned.

    Args:
        text: The string to score.

    Returns:
        The score as a Python float, or ``float('inf')`` when ``text`` is
        empty / whitespace-only or encodes to zero tokens.
    """
    if not text.strip():
        return float('inf')  # Nothing to score

    input_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors='pt')
    if input_ids.size(1) == 0:  # Encoding produced no tokens
        return float('inf')

    # GPT-2 has a fixed number of position embeddings (config.n_positions);
    # feeding a longer sequence fails inside the model, so keep only the
    # leading tokens that fit.
    max_positions = getattr(model.config, 'n_positions', None)
    if max_positions is not None and input_ids.size(1) > max_positions:
        input_ids = input_ids[:, :max_positions]

    with torch.no_grad():
        logits = model(input_ids).logits

    # NOTE(review): the logits at position i are scored against the token at
    # position i (no one-token shift as in the usual language-model
    # perplexity). Left unchanged on purpose: the threshold the page script
    # in this file compares against is presumably tuned to this quantity.
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), input_ids.view(-1))
    return torch.exp(loss).item()

def c_burstiness(text):
    """Return the share of distinct tokens that occur more than once.

    The text is lower-cased and split with ``nltk.word_tokenize``; the result
    is the number of distinct tokens seen at least twice divided by the total
    number of distinct tokens. Text that yields no tokens scores ``0.0``.
    """
    words = nltk.word_tokenize(text.lower())
    if len(words) == 0:
        return 0.0

    frequencies = FreqDist(words)
    distinct_total = len(frequencies)
    if distinct_total <= 0:
        return 0.0

    # Count the distinct tokens whose frequency exceeds one.
    repeated = [word for word, occurrences in frequencies.items() if occurrences > 1]
    return len(repeated) / distinct_total

def top_repword_count(text):
    """Render a bar chart of the ten most frequent significant tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. English
    stopwords and tokens made up entirely of punctuation characters are
    discarded, the remaining tokens are counted, and the ten most common are
    drawn with Plotly inside the current Streamlit container.

    Args:
        text: The string to analyse.

    Returns:
        None. Writes either the chart or a "No significant words found."
        message to the Streamlit page.
    """
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    def _is_significant(token):
        # Checking every character (rather than a substring test against
        # string.punctuation) also drops multi-character punctuation tokens
        # such as "``", "''", "..." and "--".
        if token in stop_words:
            return False
        return not all(char in punctuation for char in token)

    tokens = [token for token in nltk.word_tokenize(text.lower()) if _is_significant(token)]
    top_words = Counter(tokens).most_common(10)

    if not top_words:
        st.write("No significant words found.")
        return

    words, counts = zip(*top_words)
    fig = px.bar(x=words, y=counts, labels={'x': 'Words', 'y': 'Counts'}, title="Top 10 Most Repeated Words in the Text")
    # The keyword is `use_container_width`; the previous misspelling was not
    # recognised as the sizing option, so the chart never stretched.
    st.plotly_chart(fig, use_container_width=True)

# Streamlit app configuration
st.set_page_config(layout="wide")

st.title("AI Content Detector")

text_area = st.text_area("Enter your text here!")

# Decision thresholds: the text is flagged when its perplexity score is above
# the first value or its burstiness score is below the second.
PERPLEXITY_THRESHOLD = 40000
BURSTINESS_THRESHOLD = 0.24

# Whitespace-only input is treated like empty input: c_perplexity returns
# infinity for it, which would otherwise be reported as "likely AI-generated".
# The button is only rendered once there is real text to analyse.
if text_area.strip() and st.button("Analyse the content"):
    col1, col2, col3 = st.columns([1, 2, 1])

    # Left column: echo the submitted text.
    with col1:
        st.info("Your input text")
        st.success(text_area)

    # Middle column: scores and verdict.
    with col2:
        st.info("Your output score")
        perplexity = c_perplexity(text_area)
        burstiness = c_burstiness(text_area)

        st.success(f"Perplexity score: {perplexity}")
        st.success(f"Burstiness score: {burstiness}")

        if perplexity > PERPLEXITY_THRESHOLD or burstiness < BURSTINESS_THRESHOLD:
            st.error("Result: The text is likely AI-generated.")
        else:
            st.success("Result: The text is not AI-generated.")

        st.warning("Disclaimer: AI plagiarism detector apps can assist in identifying potential instances of plagiarism.")

    # Right column: word-frequency chart.
    with col3:
        st.info("Basic Review")
        top_repword_count(text_area)