copyright_checker / plagiarism.py
minko186's picture
refactored plagiarism checker
029c7a1
raw
history blame
10.2 kB
import time
from nltk.tokenize import sent_tokenize
from googleapiclient.discovery import build
from collections import Counter
import re, math
from sentence_transformers import SentenceTransformer, util
import asyncio
import httpx
from bs4 import BeautifulSoup
import numpy as np
# Matches runs of word characters; used to split text into word tokens.
WORD = re.compile(r"\w+")
# Sentence-embedding model, instantiated once at import time and shared by
# the embedding/similarity helpers in this module.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-count vectors.

    Args:
        vec1: Mapping of term -> count (e.g. a ``Counter``).
        vec2: Mapping of term -> count.

    Returns:
        A float. ``0.0`` is returned when either vector has zero magnitude,
        which avoids a division by zero.
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    squared_norm_1 = sum(count ** 2 for count in vec1.values())
    squared_norm_2 = sum(count ** 2 for count in vec2.values())
    magnitude = math.sqrt(squared_norm_1) * math.sqrt(squared_norm_2)

    if magnitude == 0:
        return 0.0
    return float(dot_product) / magnitude
def text_to_vector(text):
    """Turn *text* into a bag-of-words vector.

    Every match of the module-level ``WORD`` pattern is counted.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.
    """
    return Counter(match.group() for match in WORD.finditer(text))
def cosineSim(text1, text2):
    """Return the word-count cosine similarity of two texts.

    Both texts are vectorised with ``text_to_vector`` and the resulting
    vectors are compared with ``get_cosine``.
    """
    return get_cosine(text_to_vector(text1), text_to_vector(text2))
def cos_sim_torch(embedding_1, embedding_2):
    """Return the cosine similarity of two embeddings as a scalar.

    The similarity is computed by ``util.pytorch_cos_sim`` and unwrapped with
    ``.item()``, so the result is expected to hold a single value.
    """
    similarity = util.pytorch_cos_sim(embedding_1, embedding_2)
    return similarity.item()
def embed_text(text):
    """Encode *text* with the module-level sentence model, as a tensor."""
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding
def sentence_similarity(text1, text2):
    """Return the embedding-based cosine similarity of two texts.

    Each text is encoded with the module-level sentence model (first
    ``text1``, then ``text2``) and the two embeddings are compared with
    ``util.pytorch_cos_sim``; the single resulting value is returned.
    """
    first_vec, second_vec = (
        model.encode(chunk, convert_to_tensor=True) for chunk in (text1, text2)
    )
    return util.pytorch_cos_sim(first_vec, second_vec).item()
def _clean_snippet(snippet):
    """Strip "..." truncation markers from the edges of a search snippet."""
    ind = snippet.find("...")
    # A marker found 10-19 characters in is treated as the end of a lead-in
    # fragment; everything up to and including it is dropped.
    if 9 < ind < 20:
        snippet = snippet[ind + len("... "):]
    ind = snippet.find("...")
    # Drop a marker sitting within the last few characters. The explicit
    # "found" check keeps a short snippet without any marker intact
    # (find() returns -1, which would otherwise pass the comparison).
    if ind != -1 and ind > len(snippet) - 5:
        snippet = snippet[:ind]
    return snippet


def google_search(
    plag_option,
    sentences,
    urlCount,
    scoreArray,
    urlList,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Search each sentence chunk and score it against the result snippets.

    For every sentence, a Custom Search query is issued and the first three
    results are examined. ``urlCount``, ``scoreArray`` and ``urlList`` are
    updated in place.

    Args:
        plag_option: ``"Standard"`` scores with ``cosineSim``; any other
            value scores with ``sentence_similarity``.
        sentences: Sentence chunks to search for.
        urlCount: Dict mapping URL -> number of times it was returned.
        scoreArray: List of per-URL score rows (one column per sentence),
            kept parallel to ``urlList``.
        urlList: List of distinct URLs seen so far.
        sorted_date: Value passed as the search ``sort`` parameter.
        domains_to_skip: Domain strings; a result is skipped when
            ``"." + domain`` occurs in its URL.
        api_key: Developer key for the Custom Search service.
        cse_id: Search engine id (``cx``).
        **kwargs: Extra arguments forwarded to ``cse().list``.

    Returns:
        The ``(urlCount, scoreArray)`` pair (the same objects passed in).
    """
    service = build("customsearch", "v1", developerKey=api_key)
    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        # Only the first three results are considered; results skipped below
        # still count toward that limit.
        for link in (results.get("items") or [])[:3]:
            url = link["link"]
            # skip user selected domains
            if any(("." + domain) in url for domain in domains_to_skip):
                continue
            # Some results carry no snippet; treat that as empty text rather
            # than failing the whole search with a KeyError.
            snippet = _clean_snippet(link.get("snippet", ""))
            if url not in urlList:
                urlList.append(url)
                scoreArray.append([0] * len(sentences))
            urlCount[url] = urlCount.get(url, 0) + 1
            # update similarity between snippet and given text
            row = scoreArray[urlList.index(url)]
            if plag_option == "Standard":
                row[i] = cosineSim(sentence, snippet)
            else:
                row[i] = sentence_similarity(sentence, snippet)
    return urlCount, scoreArray
def split_sentence_blocks(text):
    """Split *text* into chunks of two consecutive sentences.

    Sentences come from ``sent_tokenize``; each pair is joined with a single
    space. With an odd number of sentences the last chunk holds one sentence.
    """
    sentences = sent_tokenize(text)
    return [
        " ".join(sentences[start : start + 2])
        for start in range(0, len(sentences), 2)
    ]
# Maps English month names to their two-digit month number.
months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}


def build_date(year=2024, month="March", day=1):
    """Return the date as a compact ``YYYYMMDD`` string.

    Args:
        year: Year, inserted as given.
        month: English month name; must be a key of ``months``.
        day: Day of the month (int or numeric string).

    Returns:
        The concatenated date string, e.g. ``"20240301"``.

    Raises:
        KeyError: If *month* is not a known month name.
        ValueError: If *day* cannot be converted to an integer.
    """
    # The day must be zero-padded to two digits: without padding, day 1
    # yields "202431", which is not a valid YYYYMMDD value.
    return f"{year}{months[month]}{int(day):02d}"
async def get_url_data(url, client):
    """Fetch *url* with *client* and parse the body as HTML.

    Returns:
        A ``BeautifulSoup`` object when the response status is 200;
        ``None`` for any other status or when any exception is raised
        (best-effort: failures are deliberately not propagated).
    """
    try:
        response = await client.get(url)
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.content, "html.parser")
    except Exception:
        return None
def remove_punc(text):
    """Return *text* with every non-word, non-whitespace character removed."""
    return re.sub(r"[^\w\s]", "", text)
def split_ngrams(text, n):
    """Return all consecutive word n-grams of *text*.

    The text is split on whitespace; each n-gram is a list of *n* words.
    An empty list is returned when the text has fewer than *n* words.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(tokens[start : start + n])
    return grams
async def parallel_scrap(urls):
    """Fetch all *urls* concurrently through one shared HTTP client.

    Returns:
        The ``asyncio.gather`` result list, in the same order as *urls*.
        Each entry is whatever ``get_url_data`` produced for that URL; since
        ``return_exceptions=True`` is used, an entry may also be an exception
        object.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        pending = [get_url_data(url=url, client=client) for url in urls]
        return await asyncio.gather(*pending, return_exceptions=True)
def matching_score(sentence, content):
    """Score how much of *sentence* appears verbatim in *content*.

    Punctuation is removed and whitespace is collapsed on both sides before
    comparing, so line breaks or repeated spaces in scraped page text do not
    prevent a match.

    Args:
        sentence: The sentence chunk to look for.
        content: The text to search in.

    Returns:
        ``1`` when the whole sentence occurs in the content; otherwise the
        fraction of the sentence's 5-word n-grams found in the content.
        ``0`` when the sentence has no words or fewer than five words
        without a full match.
    """
    words = remove_punc(sentence).split()
    # A sentence with no words left (empty or punctuation only) would be a
    # substring of every page; it carries no evidence, so score it 0.
    if not words:
        return 0
    # The n-grams below are joined with single spaces, so both texts must be
    # whitespace-normalised the same way for substring checks to be fair.
    sentence = " ".join(words)
    content = " ".join(remove_punc(content).split())
    if sentence in content:
        return 1
    n = 5
    ngrams = split_ngrams(sentence, n)
    if not ngrams:
        return 0
    matched = sum(1 for gram in ngrams if " ".join(gram) in content)
    return matched / len(ngrams)
def _best_source_per_sentence(score_rows, sentence_count):
    """Pick, for each sentence chunk, the index of its best-scoring source.

    ``score_rows[i][j]`` is the score of source *i* for sentence *j*;
    ``score_rows`` must be non-empty. Each sentence starts from the source
    chosen for the previous sentence, and another source only replaces it
    when it scores more than 0.1 higher, which favours runs of consecutive
    sentences attributed to the same source.
    """
    best = [-1] * sentence_count
    for j in range(sentence_count):
        if j > 0:
            best[j] = best[j - 1]
            top_score = score_rows[best[j]][j]
        else:
            top_score = -1
        for i, row in enumerate(score_rows):
            # The margin applies only while the current choice is still the
            # source inherited from the previous sentence.
            margin = 0.1 if (j > 0 and best[j] == best[j - 1]) else 0
            if row[j] - top_score > margin:
                top_score = row[j]
                best[j] = i
    return best


def plagiarism_check(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
):
    """Find likely web sources for each two-sentence chunk of *input*.

    The text is split into sentence chunks, each chunk is searched within the
    given date range, the result pages are scraped, and every chunk is scored
    against every page. Each chunk is then attributed to one source.

    Args:
        plag_option: Scoring mode forwarded to ``google_search``.
        input: The text to check.
        year_from, month_from, day_from: Start of the date range
            (as accepted by ``build_date``).
        year_to, month_to, day_to: End of the date range.
        domains_to_skip: Domains whose results are ignored.

    Returns:
        A list of ``(text, label)`` tuples: first one per sentence chunk,
        labelled ``"[k]"`` with the rank of its source; then, per source in
        descending score order, a ``"<url> --- Matching Score: <x>%"`` entry
        with the same label followed by a ``("\\n", None)`` separator. When
        the search yields no sources, each chunk is returned with a ``None``
        label.
    """
    # NOTE(review): credentials are hard-coded in source. They should be
    # rotated and loaded from configuration/environment instead.
    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
    cse_id = "851813e81162b4ed4"

    sentences = split_sentence_blocks(input)
    urlCount = {}
    ScoreArray = []
    urlList = []

    date_from = build_date(year_from, month_from, day_from)
    date_to = build_date(year_to, month_to, day_to)
    sort_date = f"date:r:{date_from}:{date_to}"

    # get list of URLS to check
    urlCount, ScoreArray = google_search(
        plag_option,
        sentences,
        urlCount,
        ScoreArray,
        urlList,
        sort_date,
        domains_to_skip,
        api_key,
        cse_id,
    )

    # Without any candidate source there is nothing to attribute; the
    # selection logic below would index into an empty score table.
    if not urlList:
        return [(sent, None) for sent in sentences]

    # Scrape URLs in list
    soups = asyncio.run(parallel_scrap(urlList))

    # Populate matching scores for scraped pages. Pages that could not be
    # fetched keep the snippet-based score written by google_search.
    for i, soup in enumerate(soups):
        print(f"Analyzing {i+1} of {len(soups)} soups........................")
        # gather(return_exceptions=True) may hand back exception objects.
        if soup is None or isinstance(soup, BaseException):
            continue
        page_content = soup.text
        for j, sent in enumerate(sentences):
            ScoreArray[i][j] = matching_score(sent, page_content)

    # Calculate URL of max matching score for each sentence chunk
    sentenceToMaxURL = _best_source_per_sentence(ScoreArray, len(sentences))

    # Average score of each selected source over the chunks attributed to it.
    urlScore = {}
    for source in sorted(set(sentenceToMaxURL)):
        owned = [
            ScoreArray[source][j]
            for j in range(len(sentences))
            if sentenceToMaxURL[j] == source
        ]
        urlScore[source] = sum(owned) / len(owned)

    # Rank sources by score; rank 1 is the best match.
    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
    urlMap = {source: rank for rank, source in enumerate(index_descending, 1)}

    formatted_tokens = [
        (sent, f"[{urlMap[sentenceToMaxURL[i]]}]")
        for i, sent in enumerate(sentences)
    ]
    for ind in index_descending:
        formatted_tokens.append(
            (
                f"{urlList[ind]} --- Matching Score: "
                f"{round(urlScore[ind] * 100, 2)}%",
                f"[{urlMap[ind]}]",
            )
        )
        formatted_tokens.append(("\n", None))
    return formatted_tokens