import streamlit as st
import tensorflow as tf
import numpy as np
import nltk
import os
from nltk.tokenize import sent_tokenize
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
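# NOTE (assumption): package names inferred from the imports above; the Space's
# requirements likely include streamlit, tensorflow, transformers, and nltk.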
# Hugging Face cache dir
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
# Download NLTK punkt tokenizer data
nltk_data_path = "/tmp/nltk_data"
nltk.download("punkt_tab", download_dir=nltk_data_path)
nltk.data.path.append(nltk_data_path)
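# "punkt_tab" is the Punkt sentence-tokenizer resource used by recent NLTK releases
# (older releases ship it as "punkt"); sent_tokenize below depends on it.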
# Cache the model/tokenizer
@st.cache_resource
def load_model_and_tokenizer():
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased", cache_dir="/tmp/huggingface"
    )
    model = TFDistilBertForSequenceClassification.from_pretrained(
        "sundaram07/distilbert-sentence-classifier", cache_dir="/tmp/huggingface"
    )
    return tokenizer, model
tokenizer, model = load_model_and_tokenizer()
# Predict sentence AI probability
def predict_sentence_ai_probability(sentence):
    inputs = tokenizer(sentence, return_tensors="tf", truncation=True, padding=True)
    outputs = model(inputs)
    logits = outputs.logits
    prob_ai = tf.sigmoid(logits)[0][0].numpy()
    return prob_ai
# Analyze text sentence by sentence
def predict_ai_generated_percentage(text, threshold=0.15):
    text = text.strip()
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return 0.0, []
    ai_sentence_count = 0
    results = []
    for sentence in sentences:
        prob = predict_sentence_ai_probability(sentence)
        is_ai = prob <= threshold
        results.append((sentence, prob, is_ai))
        if is_ai:
            ai_sentence_count += 1
    ai_percentage = (ai_sentence_count / len(sentences)) * 100
    return ai_percentage, results
# Streamlit UI
st.set_page_config(page_title="AI Detector", layout="wide")
st.title("๐ง AI Content Detector")
st.markdown("This app detects the percentage of **AI-generated content** using sentence-level analysis with DistilBERT.")
# Text input
user_input = st.text_area("Paste your text below to check for AI-generated sentences:", height=300)
# Analyze button logic
if st.button("Analyze"):
    # Clear previous session results
    st.session_state.analysis_done = False
    st.session_state.analysis_results = None
    st.session_state.ai_percentage = None
    if not user_input.strip():
        st.warning("Please enter some text.")
    else:
        # Perform analysis
        ai_percentage, analysis_results = predict_ai_generated_percentage(user_input)
        if len(analysis_results) == 0:
            st.warning("Not enough valid sentences to analyze.")
        else:
            st.session_state.analysis_done = True
            st.session_state.analysis_results = analysis_results
            st.session_state.ai_percentage = ai_percentage
# Show results
if st.session_state.get("analysis_done", False):
    st.subheader("Sentence-level Analysis")
    for i, (sentence, prob, is_ai) in enumerate(st.session_state.analysis_results, start=1):
        label = "Human" if not is_ai else "AI"
        st.markdown(f"**{i}.** _{sentence}_\n\n → {label}")
    st.subheader("Final Result")
    st.success(f"Estimated **AI-generated content**: **{st.session_state.ai_percentage:.2f}%**")