# Neha13's picture
# Create app.py
# 285d2df verified
import streamlit as st
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import nltk
from nltk.util import ngrams
from nltk.probability import FreqDist
import plotly.express as px
import torch.nn.functional as F
from collections import Counter
from nltk.corpus import stopwords
import string
import nltk
# Fetch the NLTK data the functions below rely on. Recent NLTK releases make
# `word_tokenize` load the `punkt_tab` resource instead of the pickled `punkt`
# models, so both are requested; `nltk.download` reports a failed fetch
# instead of raising, so the extra request is harmless on older installs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Initialize the GPT-2 tokenizer and language model used for scoring.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
def c_perplexity(text):
    """Score ``text`` with GPT-2 and return exp(cross-entropy) as a float.

    The module-level ``tokenizer`` encodes the text (without special tokens),
    the module-level ``model`` produces logits for it, and the mean
    cross-entropy between those logits and the input token ids is
    exponentiated.

    NOTE(review): logits at position ``i`` are compared with the token at the
    same position ``i`` (no one-step shift), so this is not the textbook
    next-token perplexity. The computation is intentionally left unchanged
    because callers' thresholds appear to be tuned to this scale -- confirm
    before changing it.

    Args:
        text: Raw text to score.

    Returns:
        The score as a Python float, or ``float('inf')`` when ``text`` is
        blank or encodes to zero tokens. Text longer than the model's
        position limit is scored on its leading tokens only.
    """
    if not text.strip():
        return float('inf')  # Return inf for empty input

    input_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors='pt')
    if input_ids.size(1) == 0:  # Check for empty input after encoding
        return float('inf')

    # GPT-2 has a fixed number of position embeddings; feeding more tokens
    # than that makes the forward pass fail, so only the leading window is
    # scored for very long input.
    max_positions = getattr(model.config, 'n_positions', None)
    if max_positions and input_ids.size(1) > max_positions:
        input_ids = input_ids[:, :max_positions]

    with torch.no_grad():
        logits = model(input_ids).logits

    loss = F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        input_ids.reshape(-1),
    )
    return torch.exp(loss).item()
def c_burstiness(text):
    """Return the share of distinct tokens in ``text`` that occur more than once.

    The text is lower-cased and split with ``nltk.word_tokenize``. The score
    is (number of distinct tokens seen at least twice) divided by (number of
    distinct tokens); it is 0.0 when the text yields no tokens.
    """
    words = nltk.word_tokenize(text.lower())
    if len(words) == 0:
        return 0.0

    frequencies = FreqDist(words)
    distinct_total = len(frequencies)
    if distinct_total == 0:
        return 0.0

    # Count the distinct tokens whose frequency exceeds one.
    repeated_total = len([word for word in frequencies if frequencies[word] > 1])
    return repeated_total / distinct_total
def top_repword_count(text):
    """Render a bar chart of the ten most frequent meaningful words in ``text``.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; English
    stop words and tokens made up solely of punctuation characters are
    discarded before counting. The chart is drawn with Plotly and written to
    the current Streamlit container. When nothing is left to count, a short
    message is written instead.

    Args:
        text: Raw text to analyse.

    Returns:
        None. The function only writes to the Streamlit page.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    # Drop a token when *every* character is punctuation. A plain
    # ``token in string.punctuation`` substring test lets multi-character
    # tokens such as "``", "''", "..." or "--" through as "words".
    tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token not in stop_words
        and not all(char in punctuation_chars for char in token)
    ]

    top_words = Counter(tokens).most_common(10)
    if not top_words:
        st.write("No significant words found.")
        return

    words, counts = zip(*top_words)
    fig = px.bar(
        x=words,
        y=counts,
        labels={'x': 'Words', 'y': 'Counts'},
        title="Top 10 Most Repeated Words in the Text",
    )
    # The keyword is `use_container_width`; the misspelled
    # `user_container_width` never stretched the chart to the column width.
    st.plotly_chart(fig, use_container_width=True)
# Streamlit page: take text from the user, then show the echoed input, the
# computed scores with a verdict, and a word-frequency chart side by side.
st.set_page_config(layout="wide")
st.title("AI Content Detector")
text_area = st.text_area("Enter your text here!")

# The button is only rendered once some text has been entered.
if text_area and st.button("Analyse the content"):
    input_col, score_col, review_col = st.columns([1, 2, 1])

    # Left column: echo the submitted text.
    with input_col:
        st.info("Your input text")
        st.success(text_area)

    # Middle column: both scores, the verdict and the disclaimer.
    with score_col:
        st.info("Your output score")
        perplexity = c_perplexity(text_area)
        burstiness = c_burstiness(text_area)
        st.success(f"Perplexity score: {perplexity}")
        st.success(f"Burstiness score: {burstiness}")

        flagged_as_ai = perplexity > 40000 or burstiness < 0.24
        if not flagged_as_ai:
            st.success("Result: The text is not AI-generated.")
        else:
            st.error("Result: The text is likely AI-generated.")

        st.warning("Disclaimer: AI plagiarism detector apps can assist in identifying potential instances of plagiarism.")

    # Right column: chart of the most repeated words.
    with review_col:
        st.info("Basic Review")
        top_repword_count(text_area)