aliasgerovs committed
Commit 25f5a14
2 Parent(s): 137dab1 6897d4d

Merge branch 'main' into demo

Files changed (2):
  1. app.py +44 -23
  2. utils.py +24 -10

app.py CHANGED
@@ -1,9 +1,10 @@
-from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore
+from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore, matchingScoreWithTimeout
 import gradio as gr
 from urllib.request import urlopen, Request
 from googleapiclient.discovery import build
 import requests
 import httpx
+import torch
 import re
 from bs4 import BeautifulSoup
 import numpy as np
@@ -20,7 +21,7 @@ import plotly.graph_objects as go
 import torch.nn.functional as F
 import nltk
 from unidecode import unidecode
-
+import time
 
 nltk.download('punkt')
 
@@ -54,9 +55,11 @@ def plagiarism_check(
     api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
-    api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
     cse_id = "851813e81162b4ed4"
 
+    time1 = time.perf_counter()
+    start = time.perf_counter()
     sentences = getSentences(input)
     urlCount = {}
     ScoreArray = []
@@ -78,12 +81,18 @@ def plagiarism_check(
         api_key,
         cse_id,
     )
+    print(f"Time for google search: {time.perf_counter()-time1}")
+    time1 = time.perf_counter()
+
     print("Number of URLs: ", len(urlCount))
     print(urlList)
 
     # Scrape URLs in list
     formatted_tokens = []
    soups = asyncio.run(parallel_scrap(urlList))
+
+    print(f"Time for scraping: {time.perf_counter()-time1}")
+    time1 = time.perf_counter()
     print(len(soups))
     print(
         "Successful scraping: "
@@ -98,9 +107,13 @@ def plagiarism_check(
         if soup:
             page_content = soup.text
             for j, sent in enumerate(sentences):
-                score = matchingScore(sent, page_content)
+                # score = matchingScore(sent, page_content)
+                score = matchingScoreWithTimeout(sent, page_content)
                 ScoreArray[i][j] = score
 
+    print(f"Time for matching score: {time.perf_counter()-time1}")
+    time1 = time.perf_counter()
+
     # ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
     # print("New Score Array:\n")
     # print2D(ScoreArray)
@@ -176,6 +189,8 @@ def plagiarism_check(
 
     print(f"Formatted Tokens: {formatted_tokens}")
 
+    print(f"Time for plagiarism check: {time.perf_counter()-start}")
+
     return formatted_tokens
 
 
@@ -271,29 +286,35 @@ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30,
     return decoded_segments
 
 def predict_quillbot(text):
-    tokenized_text = quillbot_tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)["input_ids"]
-    output = quillbot_model(tokenized_text)
-    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-    q_score = {"QuillBot": output_norm[1].item(), "Original": output_norm[0].item()}
-    return q_score
+    with torch.no_grad():
+        quillbot_model.eval()
+        tokenized_text = quillbot_tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)
+        output = quillbot_model(**tokenized_text)
+        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+        q_score = {"QuillBot": output_norm[1].item(), "Original": output_norm[0].item()}
+        return q_score
 
 def predict_bc(model, tokenizer, text):
-    tokens = text_bc_tokenizer(
-        text, padding='max_length', truncation=True, max_length=333, return_tensors="pt"
-    ).to(device)["input_ids"]
-    output = model(tokens)
-    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-    print("BC Score: ", output_norm)
-    return output_norm
+    with torch.no_grad():
+        model.eval()
+        tokens = text_bc_tokenizer(
+            text, padding='max_length', truncation=True, max_length=333, return_tensors="pt"
+        ).to(device)
+        output = model(**tokens)
+        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+        print("BC Score: ", output_norm)
+        return output_norm
 
 def predict_mc(model, tokenizer, text):
-    tokens = text_mc_tokenizer(
-        text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
-    ).to(device)["input_ids"]
-    output = model(tokens)
-    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-    print("MC Score: ", output_norm)
-    return output_norm
+    with torch.no_grad():
+        model.eval()
+        tokens = text_mc_tokenizer(
+            text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
+        ).to(device)
+        output = model(**tokens)
+        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+        print("MC Score: ", output_norm)
+        return output_norm
 
 def ai_generated_test(ai_option, input):
 
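The three rewritten predict functions above all follow the same inference pattern: switch the model to eval mode, disable gradient tracking, pass the whole tokenizer encoding (input_ids plus attention_mask) to the model, and softmax the logits. A minimal self-contained sketch of that pattern follows; the checkpoint name, label order, and the use of scipy's softmax are illustrative assumptions only, not values taken from app.py.

    import torch
    from scipy.special import softmax
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    # Illustrative checkpoint only; app.py loads its own fine-tuned models.
    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)

    def predict(text):
        with torch.no_grad():                  # inference only, no gradient tracking
            model.eval()                       # disable dropout, as the commit does
            tokens = tokenizer(
                text, padding="max_length", truncation=True, max_length=256, return_tensors="pt"
            ).to(device)                       # keeps input_ids *and* attention_mask
            output = model(**tokens)           # unpack the whole encoding, as in the new code
        return softmax(output.logits.cpu().numpy(), 1)[0]

    print(predict("An example sentence to classify."))
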
utils.py CHANGED
@@ -10,6 +10,7 @@ import numpy as np
 import asyncio
 import nltk
 from sentence_transformers import SentenceTransformer, util
+import threading
 
 nltk.download('punkt')
 
@@ -116,8 +117,8 @@ def googleSearch(
     )
     if "items" in results and len(results["items"]) > 0:
         for count, link in enumerate(results["items"]):
-            # stop after 5 pages
-            if count > 4:
+            # stop after 3 pages
+            if count >= 3:
                 break
             # skip user selected domains
             if any(
@@ -154,14 +155,8 @@ def googleSearch(
 
 def getQueries(text, n):
     # return n-grams of size n
-    finalq = []
     words = text.split()
-    l = len(words)
-
-    for i in range(0, l - n + 1):
-        finalq.append(words[i : i + n])
-
-    return finalq
+    return [words[i : i + n] for i in range(len(words) - n + 1)]
 
 
 def print2D(array):
@@ -195,6 +190,11 @@ async def parallel_scrap(urls):
     return results
 
 
+class TimeoutError(Exception):
+    pass
+
+
+
 def matchingScore(sentence, content):
     if sentence in content:
         return 1
@@ -208,7 +208,21 @@ def matchingScore(sentence, content):
     if len(ngrams) == 0:
         return 0
     matched = [x for x in ngrams if " ".join(x) in content]
-    return len(matched) / len(ngrams)
+    return len(matched) / len(ngrams)
+
+
+def matchingScoreWithTimeout(sentence, content):
+    def timeout_handler():
+        raise TimeoutError("Function timed out")
+
+    timer = threading.Timer(2, timeout_handler)  # Set a timer for 2 seconds
+    timer.start()
+    try:
+        score = matchingScore(sentence, content)
+        timer.cancel()  # Cancel the timer if calculation completes before timeout
+        return score
+    except TimeoutError:
+        return 0
 
 
 async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
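A small usage sketch of the utils.py helpers touched in this commit (run inside the project environment so utils.py and its dependencies import; the sample strings are made up):

    from utils import getQueries, matchingScore, matchingScoreWithTimeout

    content = "the quick brown fox jumps over the lazy dog"
    sentence = "quick brown fox jumps over"

    # getQueries returns every word-level n-gram of the text
    print(getQueries(sentence, 3))
    # [['quick', 'brown', 'fox'], ['brown', 'fox', 'jumps'], ['fox', 'jumps', 'over']]

    # matchingScore returns 1 when the sentence appears verbatim in the content,
    # otherwise the fraction of its n-grams that do
    print(matchingScore(sentence, content))             # 1

    # the wrapper delegates to matchingScore and arms a 2-second threading.Timer;
    # note the timer callback raises TimeoutError on the timer thread, not in the caller
    print(matchingScoreWithTimeout(sentence, content))  # 1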