aliasgerovs committed
Commit 25f5a14
2 Parent(s): 137dab1 6897d4d

Merge branch 'main' into demo

Files changed (2):
  1. app.py +44 -23
  2. utils.py +24 -10

app.py CHANGED
@@ -1,9 +1,10 @@
-from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore
+from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore, matchingScoreWithTimeout
 import gradio as gr
 from urllib.request import urlopen, Request
 from googleapiclient.discovery import build
 import requests
 import httpx
+import torch
 import re
 from bs4 import BeautifulSoup
 import numpy as np
@@ -20,7 +21,7 @@ import plotly.graph_objects as go
 import torch.nn.functional as F
 import nltk
 from unidecode import unidecode
-
+import time
 
 nltk.download('punkt')
 
@@ -54,9 +55,11 @@ def plagiarism_check(
     api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
-    api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
     cse_id = "851813e81162b4ed4"
 
+    time1 = time.perf_counter()
+    start = time.perf_counter()
     sentences = getSentences(input)
     urlCount = {}
     ScoreArray = []
@@ -78,12 +81,18 @@ def plagiarism_check(
         api_key,
         cse_id,
     )
+    print(f"Time for google search: {time.perf_counter()-time1}")
+    time1 = time.perf_counter()
+
     print("Number of URLs: ", len(urlCount))
     print(urlList)
 
     # Scrape URLs in list
     formatted_tokens = []
    soups = asyncio.run(parallel_scrap(urlList))
+
+    print(f"Time for scraping: {time.perf_counter()-time1}")
+    time1 = time.perf_counter()
     print(len(soups))
     print(
         "Successful scraping: "
@@ -98,9 +107,13 @@ def plagiarism_check(
         if soup:
             page_content = soup.text
             for j, sent in enumerate(sentences):
-                score = matchingScore(sent, page_content)
+                # score = matchingScore(sent, page_content)
+                score = matchingScoreWithTimeout(sent, page_content)
                 ScoreArray[i][j] = score
 
+    print(f"Time for matching score: {time.perf_counter()-time1}")
+    time1 = time.perf_counter()
+
     # ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
     # print("New Score Array:\n")
     # print2D(ScoreArray)
@@ -176,6 +189,8 @@ def plagiarism_check(
 
     print(f"Formatted Tokens: {formatted_tokens}")
 
+    print(f"Time for plagiarism check: {time.perf_counter()-start}")
+
     return formatted_tokens
 
 
@@ -271,29 +286,35 @@ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30,
     return decoded_segments
 
 def predict_quillbot(text):
-    tokenized_text = quillbot_tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)["input_ids"]
-    output = quillbot_model(tokenized_text)
-    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-    q_score = {"QuillBot": output_norm[1].item(), "Original": output_norm[0].item()}
-    return q_score
+    with torch.no_grad():
+        quillbot_model.eval()
+        tokenized_text = quillbot_tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)
+        output = quillbot_model(**tokenized_text)
+        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+        q_score = {"QuillBot": output_norm[1].item(), "Original": output_norm[0].item()}
+        return q_score
 
 def predict_bc(model, tokenizer, text):
-    tokens = text_bc_tokenizer(
-        text, padding='max_length', truncation=True, max_length=333, return_tensors="pt"
-    ).to(device)["input_ids"]
-    output = model(tokens)
-    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-    print("BC Score: ", output_norm)
-    return output_norm
+    with torch.no_grad():
+        model.eval()
+        tokens = text_bc_tokenizer(
+            text, padding='max_length', truncation=True, max_length=333, return_tensors="pt"
+        ).to(device)
+        output = model(**tokens)
+        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+        print("BC Score: ", output_norm)
+        return output_norm
 
 def predict_mc(model, tokenizer, text):
-    tokens = text_mc_tokenizer(
-        text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
-    ).to(device)["input_ids"]
-    output = model(tokens)
-    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-    print("MC Score: ", output_norm)
-    return output_norm
+    with torch.no_grad():
+        model.eval()
+        tokens = text_mc_tokenizer(
+            text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
+        ).to(device)
+        output = model(**tokens)
+        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+        print("MC Score: ", output_norm)
+        return output_norm
 
 def ai_generated_test(ai_option, input):
 
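The three rewritten predict functions above all follow the same inference pattern: switch the model to eval mode, disable gradient tracking, pass the whole tokenizer encoding (input_ids plus attention_mask) to the model, and softmax the logits. A minimal self-contained sketch of that pattern follows; the checkpoint name, label order, and the use of scipy's softmax are illustrative assumptions only, not values taken from app.py.

    import torch
    from scipy.special import softmax
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    # Illustrative checkpoint only; app.py loads its own fine-tuned models.
    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)

    def predict(text):
        with torch.no_grad():                  # inference only, no gradient tracking
            model.eval()                       # disable dropout, as the commit does
            tokens = tokenizer(
                text, padding="max_length", truncation=True, max_length=256, return_tensors="pt"
            ).to(device)                       # keeps input_ids *and* attention_mask
            output = model(**tokens)           # unpack the whole encoding, as in the new code
        return softmax(output.logits.cpu().numpy(), 1)[0]

    print(predict("An example sentence to classify."))
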
utils.py CHANGED
@@ -10,6 +10,7 @@ import numpy as np
 import asyncio
 import nltk
 from sentence_transformers import SentenceTransformer, util
+import threading
 
 nltk.download('punkt')
 
@@ -116,8 +117,8 @@ def googleSearch(
     )
     if "items" in results and len(results["items"]) > 0:
         for count, link in enumerate(results["items"]):
-            # stop after 5 pages
-            if count > 4:
+            # stop after 3 pages
+            if count >= 3:
                 break
             # skip user selected domains
             if any(
@@ -154,14 +155,8 @@ def googleSearch(
 
 def getQueries(text, n):
     # return n-grams of size n
-    finalq = []
     words = text.split()
-    l = len(words)
-
-    for i in range(0, l - n + 1):
-        finalq.append(words[i : i + n])
-
-    return finalq
+    return [words[i : i + n] for i in range(len(words) - n + 1)]
 
 
 def print2D(array):
@@ -195,6 +190,11 @@ async def parallel_scrap(urls):
     return results
 
 
+class TimeoutError(Exception):
+    pass
+
+
+
 def matchingScore(sentence, content):
     if sentence in content:
         return 1
@@ -208,7 +208,21 @@ def matchingScore(sentence, content):
     if len(ngrams) == 0:
         return 0
     matched = [x for x in ngrams if " ".join(x) in content]
-    return len(matched) / len(ngrams)
+    return len(matched) / len(ngrams)
+
+
+def matchingScoreWithTimeout(sentence, content):
+    def timeout_handler():
+        raise TimeoutError("Function timed out")
+
+    timer = threading.Timer(2, timeout_handler)  # Set a timer for 2 seconds
+    timer.start()
+    try:
+        score = matchingScore(sentence, content)
+        timer.cancel()  # Cancel the timer if calculation completes before timeout
+        return score
+    except TimeoutError:
+        return 0
 
 
 async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
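A small usage sketch of the utils.py helpers touched in this commit (run inside the project environment so utils.py and its dependencies import; the sample strings are made up):

    from utils import getQueries, matchingScore, matchingScoreWithTimeout

    content = "the quick brown fox jumps over the lazy dog"
    sentence = "quick brown fox jumps over"

    # getQueries returns every word-level n-gram of the text
    print(getQueries(sentence, 3))
    # [['quick', 'brown', 'fox'], ['brown', 'fox', 'jumps'], ['fox', 'jumps', 'over']]

    # matchingScore returns 1 when the sentence appears verbatim in the content,
    # otherwise the fraction of its n-grams that do
    print(matchingScore(sentence, content))             # 1

    # the wrapper delegates to matchingScore and arms a 2-second threading.Timer;
    # note the timer callback raises TimeoutError on the timer thread, not in the caller
    print(matchingScoreWithTimeout(sentence, content))  # 1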