Spaces:

DTabs
/

AI_score

Sleeping

App Files Files Community

DTabs committed 30 days ago

Commit

e940277

verified ·

1 Parent(s): d593f8c

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -22

app.py CHANGED Viewed

@@ -1,32 +1,57 @@
 import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2TokenizerFast, GPT2LMHeadModel
 import math
 import gradio as gr
 # -----------------------------
-# Load models (only once)
 # -----------------------------
 detectors = {
     "roberta-large": AutoModelForSequenceClassification.from_pretrained("roberta-large-openai-detector"),
     "roberta-base": AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
 }
 tokenizers = {
     "roberta-large": AutoTokenizer.from_pretrained("roberta-large-openai-detector"),
     "roberta-base": AutoTokenizer.from_pretrained("roberta-base-openai-detector")
 }
 for model in detectors.values():
     model.eval()
 gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
 gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
 gpt2_model.eval()
-# -----------------------------
 # Scoring functions
-# -----------------------------
 def ai_score_roberta(text, model_name):
     tokenizer = tokenizers[model_name]
     model = detectors[model_name]
@@ -34,8 +59,7 @@ def ai_score_roberta(text, model_name):
     with torch.no_grad():
         outputs = model(**inputs)
         probs = F.softmax(outputs.logits, dim=1)
-        ai_prob = probs[0][1].item()
-    return ai_prob
 def ai_score_perplexity(text):
     encodings = gpt2_tokenizer(text, return_tensors="pt")
@@ -43,8 +67,7 @@ def ai_score_perplexity(text):
         outputs = gpt2_model(**encodings, labels=encodings["input_ids"])
     loss = outputs.loss
     ppl = math.exp(loss.item())
-    score = 1.0 / (1.0 + ppl)
-    return score
 def robust_ai_score(text, weights={"large":0.4, "base":0.4, "ppl":0.2}, threshold_adjust=0.95):
     score_large = ai_score_roberta(text, "roberta-large")
@@ -68,19 +91,88 @@ def robust_ai_score(text, weights={"large":0.4, "base":0.4, "ppl":0.2}, threshol
     }
 # -----------------------------
-# Gradio Interface
 # -----------------------------
-def detect_ai(text):
-    result = robust_ai_score(text)
-    return result
-iface = gr.Interface(
-    fn=detect_ai,
-    inputs=gr.Textbox(lines=5, label="Enter text to analyze"),
-    outputs=gr.JSON(label="AI Detection Result"),
-    title="AI Detection API (Roberta + GPT-2)",
-    description="This tool detects whether text is AI-generated using Roberta and GPT-2 models."
-)
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

+# main.py
 import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2TokenizerFast, GPT2LMHeadModel
 import math
 import gradio as gr
+from sentence_transformers import SentenceTransformer, util
+from googlesearch import search
+from ddgs import DDGS
+from bs4 import BeautifulSoup
+import httpx
+import re, os
+import numpy as np
+import asyncio
+import logging
+import nltk
 # -----------------------------
+# Setup logging
 # -----------------------------
+# Root logger reports INFO and above; the two model libraries are limited to
+# ERROR so their messages do not drown out this app's own log lines.
+logging.basicConfig(level=logging.INFO)
+logging.getLogger("transformers").setLevel(logging.ERROR)
+logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
+# -----------------------------
+# Download nltk punkt
+# -----------------------------
+# Look for the punkt tokenizer data first and download it only when the
+# lookup fails, so repeated starts do not re-download it.
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt')
+# -----------------------------
+# -----------------------------
+# 1️⃣ AI DETECTOR SETUP
+# -----------------------------
+# Load Roberta models
+# Sequence-classification detectors, keyed by the short name that
+# ai_score_roberta receives as model_name.
 detectors = {
     "roberta-large": AutoModelForSequenceClassification.from_pretrained("roberta-large-openai-detector"),
     "roberta-base": AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
 }
+# One tokenizer per detector, stored under the same keys as `detectors`.
 tokenizers = {
     "roberta-large": AutoTokenizer.from_pretrained("roberta-large-openai-detector"),
     "roberta-base": AutoTokenizer.from_pretrained("roberta-base-openai-detector")
 }
+# Switch every detector to evaluation mode; they are only used for inference.
 for model in detectors.values():
     model.eval()
+# Load GPT-2 for perplexity
 gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
 gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
 gpt2_model.eval()
 # Scoring functions
 def ai_score_roberta(text, model_name):
     tokenizer = tokenizers[model_name]
     model = detectors[model_name]
     with torch.no_grad():
         outputs = model(**inputs)
         probs = F.softmax(outputs.logits, dim=1)
+        return probs[0][1].item()
 def ai_score_perplexity(text):
     encodings = gpt2_tokenizer(text, return_tensors="pt")
         outputs = gpt2_model(**encodings, labels=encodings["input_ids"])
     loss = outputs.loss
     ppl = math.exp(loss.item())
+    return 1.0 / (1.0 + ppl)
 def robust_ai_score(text, weights={"large":0.4, "base":0.4, "ppl":0.2}, threshold_adjust=0.95):
     score_large = ai_score_roberta(text, "roberta-large")
     }
 # -----------------------------
 # -----------------------------
+# 2️⃣ PLAGIARISM CHECKER SETUP
+# -----------------------------
+# Load MiniLM
+plag_model = SentenceTransformer('all-MiniLM-L6-v2')
+# Helper functions
+def clean_text(text):
+    return re.sub(r'\s+', ' ', text).strip()
+async def fetch_web_paragraphs(url):
+    try:
+        headers = {'User-Agent': 'Mozilla/5.0'}
+        async with httpx.AsyncClient() as client:
+            r = await client.get(url, headers=headers, timeout=10)
+            if r.status_code != 200:
+                logging.warning(f"Failed to fetch URL: {url}, status: {r.status_code}")
+                return []
+            soup = BeautifulSoup(r.text, 'html.parser')
+            return [clean_text(p.get_text()) for p in soup.find_all('p') if p.get_text().strip()]
+    except Exception as e:
+        logging.error(f"Error fetching {url}: {str(e)}")
+        return []
+async def get_search_urls(text, num_results=10):
+    urls = []
+    try:
+        urls = list(search(text, num_results=num_results, stop=num_results))
+    except Exception as e:
+        logging.warning(f"Google search failed: {str(e)}")
+    if len(urls) < num_results:
+        try:
+            with DDGS() as ddgs:
+                results = ddgs.text(text, max_results=num_results - len(urls))
+                urls += [r['href'] for r in results]
+        except Exception as e:
+            logging.warning(f"DuckDuckGo search failed: {str(e)}")
+    return urls
+def hybrid_similarity(text1, text2):
+    emb1 = plag_model.encode(text1, convert_to_tensor=True)
+    emb2 = plag_model.encode(text2, convert_to_tensor=True)
+    return util.pytorch_cos_sim(emb1, emb2).item()
+async def internet_plagiarism_score(input_text, num_results=10):
+    urls = await get_search_urls(input_text, num_results=num_results)
+    all_matches = []
+    for url in urls:
+        paragraphs = await fetch_web_paragraphs(url)
+        if not paragraphs:
+            continue
+        max_sim = max([hybrid_similarity(input_text, p) for p in paragraphs])
+        all_matches.append((url, max_sim))
+        await asyncio.sleep(0.5)
+    if not all_matches:
+        return {"score": 0, "matches": []}
+    top_matches = sorted(all_matches, key=lambda x: x[1], reverse=True)[:5]
+    avg_score = np.mean([sim for _, sim in top_matches])
+    return {
+        "score": round(avg_score * 100, 2),
+        "urls": [u for u, _ in top_matches]
+    }
+def check_plagiarism_sync(text):
+    return asyncio.run(internet_plagiarism_score(text))
+# -----------------------------
+# -----------------------------
+# 3️⃣ GRADIO UI
+# -----------------------------
+# Two-tab interface: each tab sends one textbox through a button to a JSON
+# result panel.
+with gr.Blocks() as demo:
+    with gr.Tab("AI Detection"):
+        ai_input = gr.Textbox(lines=5, label="Enter text to analyze")
+        ai_output = gr.JSON(label="AI Detection Result")
+        ai_button = gr.Button("Analyze")
+        # Only the text is passed, so robust_ai_score uses its default
+        # weights and threshold_adjust.
+        ai_button.click(fn=robust_ai_score, inputs=ai_input, outputs=ai_output)
+    with gr.Tab("Plagiarism Checker"):
+        plg_input = gr.Textbox(lines=5, label="Enter text to check plagiarism")
+        plg_output = gr.JSON(label="Plagiarism Result")
+        plg_button = gr.Button("Check Plagiarism")
+        # The synchronous wrapper is used as the callback; it drives the
+        # async checker with asyncio.run.
+        plg_button.click(fn=check_plagiarism_sync, inputs=plg_input, outputs=plg_output)
 if __name__ == "__main__":
+    # Bind to all interfaces on port 7860.
+    demo.launch(server_name="0.0.0.0", server_port=7860)