DTabs committed on
Commit
e940277
·
verified ·
1 Parent(s): d593f8c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -22
app.py CHANGED
@@ -1,32 +1,57 @@
 
1
  import torch
2
  import torch.nn.functional as F
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2TokenizerFast, GPT2LMHeadModel
4
  import math
5
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
6
 
7
  # -----------------------------
8
- # Load models (only once)
9
  # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  detectors = {
11
  "roberta-large": AutoModelForSequenceClassification.from_pretrained("roberta-large-openai-detector"),
12
  "roberta-base": AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
13
  }
14
-
15
  tokenizers = {
16
  "roberta-large": AutoTokenizer.from_pretrained("roberta-large-openai-detector"),
17
  "roberta-base": AutoTokenizer.from_pretrained("roberta-base-openai-detector")
18
  }
19
-
20
  for model in detectors.values():
21
  model.eval()
22
 
 
23
  gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
24
  gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
25
  gpt2_model.eval()
26
 
27
- # -----------------------------
28
  # Scoring functions
29
- # -----------------------------
30
  def ai_score_roberta(text, model_name):
31
  tokenizer = tokenizers[model_name]
32
  model = detectors[model_name]
@@ -34,8 +59,7 @@ def ai_score_roberta(text, model_name):
34
  with torch.no_grad():
35
  outputs = model(**inputs)
36
  probs = F.softmax(outputs.logits, dim=1)
37
- ai_prob = probs[0][1].item()
38
- return ai_prob
39
 
40
  def ai_score_perplexity(text):
41
  encodings = gpt2_tokenizer(text, return_tensors="pt")
@@ -43,8 +67,7 @@ def ai_score_perplexity(text):
43
  outputs = gpt2_model(**encodings, labels=encodings["input_ids"])
44
  loss = outputs.loss
45
  ppl = math.exp(loss.item())
46
- score = 1.0 / (1.0 + ppl)
47
- return score
48
 
49
  def robust_ai_score(text, weights={"large":0.4, "base":0.4, "ppl":0.2}, threshold_adjust=0.95):
50
  score_large = ai_score_roberta(text, "roberta-large")
@@ -68,19 +91,88 @@ def robust_ai_score(text, weights={"large":0.4, "base":0.4, "ppl":0.2}, threshol
68
  }
69
 
70
  # -----------------------------
71
- # Gradio Interface
72
  # -----------------------------
73
- def detect_ai(text):
74
- result = robust_ai_score(text)
75
- return result
76
-
77
- iface = gr.Interface(
78
- fn=detect_ai,
79
- inputs=gr.Textbox(lines=5, label="Enter text to analyze"),
80
- outputs=gr.JSON(label="AI Detection Result"),
81
- title="AI Detection API (Roberta + GPT-2)",
82
- description="This tool detects whether text is AI-generated using Roberta and GPT-2 models."
83
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  if __name__ == "__main__":
86
- iface.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ # main.py
2
  import torch
3
  import torch.nn.functional as F
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2TokenizerFast, GPT2LMHeadModel
5
  import math
6
  import gradio as gr
7
+ from sentence_transformers import SentenceTransformer, util
8
+ from googlesearch import search
9
+ from ddgs import DDGS
10
+ from bs4 import BeautifulSoup
11
+ import httpx
12
+ import re, os
13
+ import numpy as np
14
+ import asyncio
15
+ import logging
16
+ import nltk
17
 
18
  # -----------------------------
19
+ # Setup logging
20
  # -----------------------------
21
+ logging.basicConfig(level=logging.INFO)
22
+ logging.getLogger("transformers").setLevel(logging.ERROR)
23
+ logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
24
+
25
+ # -----------------------------
26
+ # Download nltk punkt
27
+ # -----------------------------
28
+ try:
29
+ nltk.data.find('tokenizers/punkt')
30
+ except LookupError:
31
+ nltk.download('punkt')
32
+
33
+ # -----------------------------
34
+ # -----------------------------
35
+ # 1️⃣ AI DETECTOR SETUP
36
+ # -----------------------------
37
+ # Load Roberta models
38
  detectors = {
39
  "roberta-large": AutoModelForSequenceClassification.from_pretrained("roberta-large-openai-detector"),
40
  "roberta-base": AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
41
  }
 
42
  tokenizers = {
43
  "roberta-large": AutoTokenizer.from_pretrained("roberta-large-openai-detector"),
44
  "roberta-base": AutoTokenizer.from_pretrained("roberta-base-openai-detector")
45
  }
 
46
  for model in detectors.values():
47
  model.eval()
48
 
49
+ # Load GPT-2 for perplexity
50
  gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
51
  gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
52
  gpt2_model.eval()
53
 
 
54
  # Scoring functions
 
55
  def ai_score_roberta(text, model_name):
56
  tokenizer = tokenizers[model_name]
57
  model = detectors[model_name]
 
59
  with torch.no_grad():
60
  outputs = model(**inputs)
61
  probs = F.softmax(outputs.logits, dim=1)
62
+ return probs[0][1].item()
 
63
 
64
  def ai_score_perplexity(text):
65
  encodings = gpt2_tokenizer(text, return_tensors="pt")
 
67
  outputs = gpt2_model(**encodings, labels=encodings["input_ids"])
68
  loss = outputs.loss
69
  ppl = math.exp(loss.item())
70
+ return 1.0 / (1.0 + ppl)
 
71
 
72
  def robust_ai_score(text, weights={"large":0.4, "base":0.4, "ppl":0.2}, threshold_adjust=0.95):
73
  score_large = ai_score_roberta(text, "roberta-large")
 
91
  }
92
 
93
  # -----------------------------
 
94
  # -----------------------------
95
+ # 2️⃣ PLAGIARISM CHECKER SETUP
96
+ # -----------------------------
97
+ # Load MiniLM
98
+ plag_model = SentenceTransformer('all-MiniLM-L6-v2')
99
+
100
+ # Helper functions
101
def clean_text(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends."""
    # str.split() with no argument splits on whitespace runs and drops
    # leading/trailing empties, so re-joining with a single space yields the
    # normalized string directly.
    return " ".join(text.split())
103
+
104
async def fetch_web_paragraphs(url):
    """Download *url* and return the cleaned text of its non-empty <p> elements.

    Best-effort: any non-200 final status or any exception is logged and
    results in an empty list, so one bad page never aborts the caller.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of whitespace-normalized paragraph strings (possibly empty).
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        # httpx does not follow redirects unless asked; without this, every
        # result that answers 301/302 was reported as a failed fetch.
        async with httpx.AsyncClient(follow_redirects=True) as client:
            r = await client.get(url, headers=headers, timeout=10)
            if r.status_code != 200:
                logging.warning(f"Failed to fetch URL: {url}, status: {r.status_code}")
                return []
            soup = BeautifulSoup(r.text, 'html.parser')
            paragraphs = []
            for p in soup.find_all('p'):
                # Extract and normalize once; a paragraph that is only
                # whitespace cleans to "" and is skipped.
                cleaned = clean_text(p.get_text())
                if cleaned:
                    paragraphs.append(cleaned)
            return paragraphs
    except Exception as e:
        # Deliberate catch-all: network, decoding and parsing errors are all
        # treated as "no paragraphs" for this URL.
        logging.error(f"Error fetching {url}: {str(e)}")
        return []
117
+
118
async def get_search_urls(text, num_results=10):
    """Collect up to *num_results* result URLs for *text*.

    Google (via ``googlesearch.search``) is tried first; if it fails or
    returns too few results, DuckDuckGo (``DDGS``) fills the remainder.
    Both lookups are best-effort: failures are logged, never raised.

    Args:
        text: The search query.
        num_results: Maximum number of URLs to return.

    Returns:
        A list of unique URLs in discovery order (possibly empty).
    """
    urls = []
    try:
        # The original call passed both ``num_results`` and ``stop``.
        # NOTE(review): the two packages that provide ``googlesearch`` appear
        # to accept only one of these each, so the combined call would raise
        # TypeError on either -- confirm against the installed package.
        try:
            results = search(text, num_results=num_results)
        except TypeError:
            results = search(text, stop=num_results)
        urls = list(results)[:num_results]
    except Exception as e:
        logging.warning(f"Google search failed: {str(e)}")
    if len(urls) < num_results:
        try:
            with DDGS() as ddgs:
                results = ddgs.text(text, max_results=num_results - len(urls))
                urls += [r['href'] for r in results]
        except Exception as e:
            logging.warning(f"DuckDuckGo search failed: {str(e)}")
    # Drop empty entries and duplicates (both engines can return the same
    # page) while keeping the original order, so no URL is fetched twice.
    return list(dict.fromkeys(u for u in urls if u))[:num_results]
132
+
133
def hybrid_similarity(text1, text2):
    """Return the cosine similarity of the two texts' sentence embeddings.

    Each text is encoded separately with the module-level ``plag_model``;
    the result is a plain Python float.
    """
    vec_a, vec_b = (
        plag_model.encode(sample, convert_to_tensor=True)
        for sample in (text1, text2)
    )
    similarity = util.pytorch_cos_sim(vec_a, vec_b)
    return similarity.item()
137
+
138
async def internet_plagiarism_score(input_text, num_results=10):
    """Estimate how closely *input_text* matches text found on the web.

    Searches for the text, fetches each result page, and for every page
    keeps the highest similarity between the input and any one paragraph.
    The score is the mean of the five best pages, as a percentage.

    Args:
        input_text: Text to check.
        num_results: Number of search results to inspect.

    Returns:
        A dict with the same keys in every case:
        ``score`` (0-100, rounded to 2 places), ``urls`` (best-matching
        pages, most similar first) and ``matches`` (one
        ``{"url", "similarity"}`` entry per returned URL).
    """
    # Blank input has nothing to compare; skip the network round trips.
    if not input_text or not input_text.strip():
        return {"score": 0, "matches": [], "urls": []}

    urls = await get_search_urls(input_text, num_results=num_results)
    all_matches = []
    for url in urls:
        paragraphs = await fetch_web_paragraphs(url)
        if not paragraphs:
            continue
        max_sim = max(hybrid_similarity(input_text, p) for p in paragraphs)
        all_matches.append((url, max_sim))
        # Short pause between successfully fetched pages.
        await asyncio.sleep(0.5)
    if not all_matches:
        # Previously this branch returned "matches" but no "urls", while the
        # success branch returned "urls" but no "matches".
        return {"score": 0, "matches": [], "urls": []}
    top_matches = sorted(all_matches, key=lambda x: x[1], reverse=True)[:5]
    # Convert the NumPy scalar to a builtin float for the JSON output.
    avg_score = float(np.mean([sim for _, sim in top_matches]))
    return {
        "score": round(avg_score * 100, 2),
        "urls": [u for u, _ in top_matches],
        "matches": [
            {"url": u, "similarity": round(float(sim), 4)}
            for u, sim in top_matches
        ],
    }
156
+
157
def check_plagiarism_sync(text):
    """Blocking entry point: run the async plagiarism check for *text* to completion."""
    pending = internet_plagiarism_score(text)
    # asyncio.run creates an event loop, drives the coroutine, and closes it.
    return asyncio.run(pending)
159
+
160
+ # -----------------------------
161
+ # -----------------------------
162
+ # 3️⃣ GRADIO UI
163
+ # -----------------------------
164
# Two-tab Gradio app: each tab has a text input, a JSON output panel, and a
# button that passes the input to one of the functions defined earlier in
# this file.
with gr.Blocks() as demo:
    # Tab 1: the button sends the textbox content to robust_ai_score.
    with gr.Tab("AI Detection"):
        ai_input = gr.Textbox(lines=5, label="Enter text to analyze")
        ai_output = gr.JSON(label="AI Detection Result")
        ai_button = gr.Button("Analyze")
        ai_button.click(fn=robust_ai_score, inputs=ai_input, outputs=ai_output)

    # Tab 2: the button sends the textbox content to check_plagiarism_sync,
    # the blocking wrapper around the async web check.
    with gr.Tab("Plagiarism Checker"):
        plg_input = gr.Textbox(lines=5, label="Enter text to check plagiarism")
        plg_output = gr.JSON(label="Plagiarism Result")
        plg_button = gr.Button("Check Plagiarism")
        plg_button.click(fn=check_plagiarism_sync, inputs=plg_input, outputs=plg_output)

# Start the server only when run as a script; binds all interfaces on port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)