Spaces:

polygraf-ai
/

copyright_checker

Running

File size: 16,683 Bytes

029c7a1
 
 
 
 
 
 
 
 
 
45d10c4
c78ec74
0eaca07
 
029c7a1
 
 
 
 
350b1a0
 
 
 
 
 
 
 
 
 
 
 
 
 
029c7a1
350b1a0
 
8e79582
 
 
350b1a0
 
8e79582
 
350b1a0
 
 
 
029c7a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350b1a0
 
 
 
 
 
 
 
 
 
029c7a1
 
9c75413
 
0eaca07
9c75413
 
0eaca07
2b72059
029c7a1
 
 
 
 
 
350b1a0
 
 
029c7a1
 
350b1a0
 
 
 
 
029c7a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0eaca07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c78ec74
d03ef17
029c7a1
0eaca07
 
 
029c7a1
 
d03ef17
8e79582
 
d03ef17
8e79582
 
 
 
 
0eaca07
 
 
 
 
 
 
 
 
 
 
 
 
 
8e79582
0eaca07
 
 
 
 
 
 
d03ef17
c0a6bc9
c78ec74
74f95a7
c78ec74
 
c0a6bc9
 
7ec48d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0eaca07
 
 
 
 
 
 
 
350b1a0
c0a6bc9
350b1a0
 
 
 
0eaca07
350b1a0
c0a6bc9
350b1a0
 
 
c0a6bc9
350b1a0
8e79582
350b1a0
 
 
 
 
 
 
 
8e79582
350b1a0
 
9c75413
350b1a0
 
 
 
 
 
 
 
 
 
 
c0a6bc9
350b1a0
 
 
 
 
0eaca07
350b1a0
0eaca07
350b1a0
 
 
 
 
 
 
 
 
c0a6bc9
 
029c7a1
 
 
 
 
 
 
 
 
 
9c75413
029c7a1
8e79582
0eaca07
350b1a0
 
af21e05
8fb8d86
413cf6e
029c7a1
 
c0a6bc9
 
9c75413
c0a6bc9
 
 
0eaca07
029c7a1
 
 
 
c0a6bc9
029c7a1
 
c0a6bc9
 
 
0eaca07
029c7a1
 
 
 
 
 
c0a6bc9
c78ec74
 
 
 
 
d03ef17
c78ec74
0eaca07
 
 
 
c0a6bc9
 
 
c78ec74
 
 
0eaca07
 
c0a6bc9
 
7ec48d6
029c7a1
 
c0a6bc9
029c7a1
 
c0a6bc9
029c7a1
 
 
c0a6bc9
 
029c7a1
 
 
c0a6bc9
7ec48d6
029c7a1
c0a6bc9
 
 
0eaca07
 
 
 
 
 
029c7a1
c0a6bc9
7ec48d6
0eaca07
 
c0a6bc9
8fb8d86
0eaca07
 
 
 
 
 
 
 
 
7ec48d6
0eaca07
 
 
 
 
 
7ec48d6
029c7a1
c0a6bc9
350b1a0
 
 
 
 
 
 
 
 
 
 
 
9c75413
350b1a0
d03ef17
350b1a0
 
 
 
 
 
 
 
 
 
9c75413
350b1a0
 
0eaca07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350b1a0
 
0eaca07
 
8fb8d86
0eaca07
 
 
 
 
 
8fb8d86
0eaca07
8fb8d86
350b1a0
 
0eaca07
 
350b1a0
 
 
 
 
8fb8d86
0eaca07
 
 
 
 
 
350b1a0
 
0eaca07
 
 
 
350b1a0
 
 
0eaca07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350b1a0
0eaca07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350b1a0
 
0eaca07
9c75413
d03ef17
 
0eaca07

import time
from nltk.tokenize import sent_tokenize
from googleapiclient.discovery import build
from collections import Counter
import re, math
from sentence_transformers import SentenceTransformer, util
import asyncio
import httpx
from bs4 import BeautifulSoup
import numpy as np
import concurrent
from multiprocessing import Pool
from const import url_types
from collections import defaultdict

# Matches maximal runs of word characters; text_to_vector uses it to tokenize.
WORD = re.compile(r"\w+")
# Sentence-embedding model, loaded once at import time and shared by
# sentence_similarity.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


# English month name -> zero-padded two-digit month number ("March" -> "03").
months = {
    name: f"{number:02d}"
    for number, name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
}

# Highlight colors for matched sources. html_highlight picks the entry at
# position idx - 1 for a source whose 1-based index is idx.
color_map = [
    "#cf2323",
    "#d65129",
    "#d66329",
    "#d67129",
    "#eb9d59",
    "#c2ad36",
    "#d6ae29",
    "#d6b929",
    "#e1ed72",
    "#c2db76",
    "#a2db76",
]


def text_to_vector(text):
    """Count how often each word token occurs in ``text``.

    Tokens are the non-overlapping matches of the module-level ``WORD``
    pattern. Returns a ``Counter`` mapping each token to its frequency.
    """
    return Counter(WORD.findall(text))


def cosineSim(text1, text2):
    """Return the cosine similarity between the word counts of two texts.

    Both texts are turned into word-frequency vectors with
    ``text_to_vector`` and compared with ``get_cosine``.
    """
    return get_cosine(text_to_vector(text1), text_to_vector(text2))


def get_cosine(vec1, vec2):
    """Compute the cosine similarity of two sparse count vectors.

    Args:
        vec1: Mapping from term to numeric weight.
        vec2: Mapping from term to numeric weight.

    Returns:
        The dot product over the shared terms divided by the product of the
        two Euclidean norms, or ``0.0`` when either vector has zero norm.
    """
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = 0
    for term in shared_terms:
        dot_product += vec1[term] * vec2[term]

    norm1 = math.sqrt(sum(vec1[term] ** 2 for term in vec1.keys()))
    norm2 = math.sqrt(sum(vec2[term] ** 2 for term in vec2.keys()))
    magnitude = norm1 * norm2

    # An empty (or all-zero) vector has no direction; report no similarity.
    if magnitude == 0:
        return 0.0
    return float(dot_product) / magnitude


def split_sentence_blocks(text, size):
    """Split ``text`` into the blocks that are checked one at a time.

    Args:
        text: The input text.
        size: ``"Paragraph"`` to split on newlines; any other value splits
            into sentences with NLTK's ``sent_tokenize``.

    Returns:
        A list of block strings. In paragraph mode, blank or
        whitespace-only lines are dropped so that empty blocks are never
        handed on as search queries.
    """
    cleaned = text.strip()
    if size == "Paragraph":
        # Consecutive newlines (blank lines between paragraphs) would
        # otherwise yield "" blocks.
        return [block for block in cleaned.split("\n") if block.strip()]
    return sent_tokenize(cleaned)


def build_date(year=2024, month="March", day=1):
    """Format a date as ``YYYYMMDD`` for a search date-range restriction.

    Args:
        year: Year number.
        month: English month name; must be a key of ``months``.
        day: Day of the month, as an int or a numeric string.

    Returns:
        The concatenated date string, e.g. ``"20240301"``.

    Raises:
        KeyError: If ``month`` is not a key of ``months``.
        ValueError: If ``day`` cannot be converted to an integer.
    """
    # The day must be zero-padded: "202431" is not a valid YYYYMMDD date,
    # whereas "20240301" is.
    return f"{year}{months[month]}{int(day):02d}"


def split_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated words.

    Args:
        text: Text to split on whitespace.
        n: Window length in words.

    Returns:
        A list of word lists, one per sliding-window position; empty when
        the text has fewer than ``n`` words.
    """
    tokens = text.split()
    windows = []
    for start in range(len(tokens) - n + 1):
        windows.append(tokens[start : start + n])
    return windows


def sentence_similarity(text1, text2):
    """Return the cosine similarity of the two texts' sentence embeddings.

    Both texts are encoded with the module-level ``model``; the similarity
    computed by ``util.pytorch_cos_sim`` is returned as a Python scalar.
    """
    first, second = (
        model.encode(text, convert_to_tensor=True) for text in (text1, text2)
    )
    return util.pytorch_cos_sim(first, second).item()


async def get_url_data(url, client):
    """Fetch ``url`` and parse the response body as HTML.

    Args:
        url: Address to request.
        client: Object exposing an awaitable ``get(url)`` whose result has
            ``status_code`` and ``content`` attributes.

    Returns:
        A ``BeautifulSoup`` document for a 200 response; ``None`` for any
        other status code or when fetching/parsing raises an exception.
    """
    try:
        response = await client.get(url)
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.content, "html.parser")
    except Exception:
        # Best effort: a page that cannot be fetched is simply skipped.
        return None


async def parallel_scrap(urls):
    """Fetch all ``urls`` concurrently over one shared HTTP client.

    Returns:
        A list with one entry per URL, in input order: whatever
        ``get_url_data`` returned for it, or the exception object if the
        task raised (``return_exceptions=True``).
    """
    async with httpx.AsyncClient(timeout=30) as client:
        pending = [get_url_data(url=url, client=client) for url in urls]
        return await asyncio.gather(*pending, return_exceptions=True)


def merge_ngrams_into_sentence(ngrams, max_ngrams=20):
    """Stitch a sequence of overlapping n-grams back into one string.

    Each n-gram is appended to the running word list after dropping the
    longest prefix that already appears as the suffix of that list, so
    consecutive sliding-window n-grams collapse into continuous text.

    Args:
        ngrams: Sequence of word sequences (lists or tuples), or ``None``.
        max_ngrams: Only the first ``max_ngrams`` n-grams are merged.

    Returns:
        The merged words joined by single spaces; ``""`` for ``None`` or an
        empty sequence.
    """
    if ngrams is None:
        return ""

    merged_words = []
    for ngram in ngrams[:max_ngrams]:
        words = list(ngram)  # tuples never compare equal to list slices
        # Positional overlap: longest k with merged[-k:] == words[:k].
        # (Counting shared words as a set mis-measures this whenever a word
        # repeats or appears out of position.)
        overlap = 0
        for size in range(min(len(words), len(merged_words)), 0, -1):
            if merged_words[-size:] == words[:size]:
                overlap = size
                break
        merged_words.extend(words[overlap:])
    return " ".join(merged_words)


def remove_ngrams_after(ngrams, target_ngram):
    """Truncate ``ngrams`` just after the first occurrence of a target.

    Returns:
        The prefix of ``ngrams`` up to and including the first element equal
        to ``target_ngram``, or ``None`` when the target does not occur.
    """
    try:
        cut = ngrams.index(target_ngram) + 1
    except ValueError:
        return None
    return ngrams[:cut]


def matching_score(sentence_content_tuple):
    """Score how much of a sentence appears in a page's text.

    Args:
        sentence_content_tuple: ``(sentence, content, score)``. The third
            element is accepted for compatibility with the caller's tuples
            but is not used here.

    Returns:
        ``(score, matched_text)``:
        * ``(1, sentence)`` when the sentence occurs verbatim in ``content``;
        * ``(0, "")`` when the sentence is blank or has fewer than 5 words;
        * otherwise the fraction of the sentence's 5-word n-grams that occur
          in ``content``, together with text rebuilt from the content
          n-grams around the match (``""`` when nothing matched).
    """
    sentence, content, _snippet_score = sentence_content_tuple

    # "" (or pure whitespace) is a substring of almost any page; without
    # this guard a blank block would be reported as a perfect match.
    if not sentence.strip():
        return 0, ""
    if sentence in content:
        return 1, sentence

    n = 5
    ngrams_sentence = [tuple(ngram) for ngram in split_ngrams(sentence, n)]
    if not ngrams_sentence:
        return 0, ""
    ngrams_content = [tuple(ngram) for ngram in split_ngrams(content, n)]

    # Set membership keeps the count linear in the page length.
    content_ngram_set = set(ngrams_content)
    matched_count = sum(
        1 for ngram in ngrams_sentence if ngram in content_ngram_set
    )
    if matched_count == 0:
        # Nothing to reconstruct; skip the quadratic scan below entirely.
        return 0.0, ""

    # Collect content n-grams from the first match onwards; once a match has
    # been seen, every subsequent content n-gram is recorded (the flag is
    # never reset), and the list is then cut just after the first occurrence
    # of the last n-gram that matched.
    matched_content_ngrams = []
    found = False
    last_found = None
    for ngram in ngrams_sentence:
        for ngram_content in ngrams_content:
            if ngram == ngram_content:
                found = True
                last_found = ngram_content
            if found:
                matched_content_ngrams.append(ngram_content)
    matched_content_ngrams = remove_ngrams_after(
        matched_content_ngrams, last_found
    )
    matched_content = merge_ngrams_into_sentence(matched_content_ngrams)

    return matched_count / len(ngrams_sentence), matched_content


def process_with_multiprocessing(input_data):
    """Run ``matching_score`` over ``input_data`` in a pool of 8 processes.

    Returns:
        The list of ``matching_score`` results, in the same order as
        ``input_data``.
    """
    worker_pool = Pool(processes=8)
    with worker_pool:
        return worker_pool.map(matching_score, input_data)


def map_sentence_url(sentences, score_array):
    """Assign each sentence the index of its best-scoring URL.

    Args:
        sentences: The sentence blocks; only their count is used.
        score_array: ``score_array[i][j]`` is the score of URL ``i`` for
            sentence ``j``.

    Returns:
        A list with one URL index per sentence. Every entry is ``-1`` when
        ``score_array`` is empty.
    """
    best_url = [-1] * len(sentences)
    # With no candidate URLs there is nothing to assign; indexing
    # score_array below would raise IndexError.
    if not score_array:
        return best_url

    for j in range(len(sentences)):
        if j > 0:
            # Start from the previous sentence's URL so that neighbouring
            # sentences stick to the same source unless another is clearly
            # better.
            best_url[j] = best_url[j - 1]
            max_score = score_array[best_url[j]][j]
        else:
            max_score = -1
        for i, url_scores in enumerate(score_array):
            # While still on the previous sentence's URL, a challenger must
            # win by more than 0.05; after a switch any improvement counts.
            margin = (
                0.05 if (j > 0 and best_url[j] == best_url[j - 1]) else 0
            )
            if url_scores[j] - max_score > margin:
                max_score = url_scores[j]
                best_url[j] = i
    return best_url


def check_url_category(url):
    """Return the first ``url_types`` category with a pattern found in ``url``.

    Categories are tried in ``url_types`` order and a pattern matches when
    it is a substring of ``url``. Falls back to ``"Internet Source"``.
    """
    for category, patterns in url_types.items():
        if any(pattern in url for pattern in patterns):
            return category
    return "Internet Source"


def _clean_snippet(snippet):
    """Trim the "..." ellipses at the start and end of a search snippet."""
    # Leading ellipsis: only when it sits between positions 10 and 19, in
    # which case everything up to and including "... " is dropped.
    ind = snippet.find("...")
    if 9 < ind < 20:
        snippet = snippet[ind + len("... ") :]
    # Trailing ellipsis: only when "..." is actually present near the end.
    # Without the -1 check, very short snippets lost their last character.
    ind = snippet.find("...")
    if ind != -1 and ind > len(snippet) - 5:
        snippet = snippet[:ind]
    return snippet


def google_search(
    plag_option,
    sentences,
    url_count,
    score_array,
    url_list,
    snippets,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Search each sentence and record snippet similarity per result URL.

    For every sentence the first three results of a Custom Search query are
    considered. ``url_list``, ``score_array``, ``snippets`` and ``url_count``
    are updated in place: row ``r`` of ``score_array`` / ``snippets`` belongs
    to ``url_list[r]`` and column ``i`` to ``sentences[i]``.

    Args:
        plag_option: ``"Standard"`` scores with ``cosineSim``; any other
            value scores with ``sentence_similarity``.
        sentences: Text blocks to search for.
        url_count: Dict counting how many times each URL was returned.
        score_array: Per-URL lists of per-sentence scores.
        url_list: URLs discovered so far.
        snippets: Per-URL lists of per-sentence snippets.
        sorted_date: Value passed as the query's ``sort`` argument.
        domains_to_skip: Iterable of domain suffixes to ignore, or ``None``.
        api_key: Developer key for the search service.
        cse_id: Search engine id passed as ``cx``.
        **kwargs: Extra arguments forwarded to ``cse().list``.

    Returns:
        ``(url_count, score_array)``.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    num_pages = 3
    similarity = cosineSim if plag_option == "Standard" else sentence_similarity

    for i, sentence in enumerate(sentences):
        # A blank block carries nothing to search for.
        if not sentence.strip():
            continue
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        # Only the first `num_pages` results are looked at; results skipped
        # below still count towards that limit.
        for link in (results.get("items") or [])[:num_pages]:
            url = link["link"]
            # skip user selected domains
            if domains_to_skip is not None and any(
                ("." + domain) in url for domain in domains_to_skip
            ):
                continue

            # NOTE(review): assumes a result may come back without a
            # "snippet" field; such results are treated as an empty snippet
            # instead of raising KeyError.
            snippet = _clean_snippet(link.get("snippet", ""))

            if url not in url_list:
                url_list.append(url)
                score_array.append([0] * len(sentences))
                snippets.append([""] * len(sentences))
            row = url_list.index(url)

            url_count[url] = url_count.get(url, 0) + 1
            snippets[row][i] = snippet
            score_array[row][i] = similarity(sentence, snippet)
    return url_count, score_array


def plagiarism_check(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
    source_block_size,
):
    """Find likely web sources for each block of ``input``.

    The text is split into blocks, each block is searched, the result pages
    are scraped and scored against every block, and each block is assigned
    its best source.

    Args:
        plag_option: Similarity mode forwarded to ``google_search``.
        input: The text to check.
        year_from, month_from, day_from: Start of the search date range.
        year_to, month_to, day_to: End of the search date range.
        domains_to_skip: Domains to ignore, or ``None``.
        source_block_size: Block mode forwarded to ``split_sentence_blocks``.

    Returns:
        ``(sentence_scores, url_scores)`` where
        * ``sentence_scores`` has one ``[block, score_percent_or_None, url,
          source_rank_or_-1]`` entry per block, and
        * ``url_scores`` has one ``[url, score_percent, source_rank,
          matched_text]`` entry per source scoring above 10%, best first.
        When the search yields no URLs, every block is reported as
        ``[block, None, "", -1]`` and ``url_scores`` is empty.
    """
    import os

    # NOTE(review): credentials should come from the environment. The
    # literals are kept only as a fallback so existing deployments keep
    # working; a key committed to source control should be rotated.
    api_key = os.environ.get(
        "GOOGLE_SEARCH_API_KEY", "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
    )
    cse_id = os.environ.get("GOOGLE_CSE_ID", "851813e81162b4ed4")

    url_scores = []
    sentence_scores = []
    sentences = split_sentence_blocks(input, source_block_size)
    url_count = {}
    score_array = []
    url_list = []
    snippets = []
    date_from = build_date(year_from, month_from, day_from)
    date_to = build_date(year_to, month_to, day_to)
    sort_date = f"date:r:{date_from}:{date_to}"
    # get list of URLS to check
    url_count, score_array = google_search(
        plag_option,
        sentences,
        url_count,
        score_array,
        url_list,
        snippets,
        sort_date,
        domains_to_skip,
        api_key,
        cse_id,
    )

    # No search hits at all: nothing to scrape or rank, and the indexing
    # below (score_array[0], url_list[-1]) would fail.
    if not url_list:
        return [[sent, None, "", -1] for sent in sentences], url_scores

    # Scrape URLs in list
    soups = asyncio.run(parallel_scrap(url_list))
    # Rows whose page was fetched; gather() may hand back exception objects,
    # which are truthy but have no text.
    scraped_rows = [
        i
        for i, soup in enumerate(soups)
        if not isinstance(soup, BaseException) and soup
    ]

    input_data = []
    for i in scraped_rows:
        page_content = soups[i].text
        for j, sent in enumerate(sentences):
            input_data.append((sent, page_content, score_array[i][j]))
    scores = process_with_multiprocessing(input_data) if input_data else []

    matched_sentence_array = [[""] * len(sentences) for _ in score_array]

    # Replace the snippet-based score of every scraped (url, block) pair
    # with the page-based score; rows that could not be scraped keep theirs.
    k = 0
    for i in scraped_rows:
        for j in range(len(sentences)):
            score_array[i][j], matched_sentence_array[i][j] = scores[k]
            k += 1

    sentence_to_max_url = map_sentence_url(sentences, score_array)
    selected_urls = np.unique(sentence_to_max_url)

    # Average score of each selected URL over the blocks assigned to it.
    url_source = {}
    for url_index in selected_urls:
        assigned = [
            score_array[url_index][sen]
            for sen in range(len(sentences))
            if sentence_to_max_url[sen] == url_index
        ]
        url_source[url_index] = sum(assigned) / len(assigned)
    index_descending = sorted(url_source, key=url_source.get, reverse=True)
    url_rank = {
        url_index: rank
        for rank, url_index in enumerate(index_descending, start=1)
    }

    # build results
    for i, sent in enumerate(sentences):
        ind = sentence_to_max_url[i]
        if url_source[ind] > 0.1:
            sentence_scores.append(
                [
                    sent,
                    round(url_source[ind] * 100, 2),
                    url_list[ind],
                    url_rank[ind],
                ]
            )
        else:
            sentence_scores.append([sent, None, url_list[ind], -1])

    # Drop empty cells once, up front, instead of on every loop iteration.
    snippets = [[item for item in sublist if item] for sublist in snippets]
    matched_sentence_array = [
        [item for item in sublist if item]
        for sublist in matched_sentence_array
    ]
    for ind in index_descending:
        if url_source[ind] > 0.1:
            matched_sentence = "...".join(matched_sentence_array[ind])
            if matched_sentence == "":
                # No page-level match text; fall back to the search snippets.
                matched_sentence = "...".join(snippets[ind])
            url_scores.append(
                [
                    url_list[ind],
                    round(url_source[ind] * 100, 2),
                    url_rank[ind],
                    matched_sentence,
                ]
            )

    return sentence_scores, url_scores


def _source_color(idx):
    """Return the highlight color for the source ranked ``idx`` (1-based)."""
    if idx == -1:
        # Unmatched text: keep the color the original lookup
        # color_map[-1 - 1] produced.
        return color_map[-2]
    # More sources than colors: reuse the last color instead of failing.
    return color_map[min(idx, len(color_map)) - 1]


def _highlighted_paragraph(text, idx):
    """Render already-escaped ``text`` as a paragraph colored for ``idx``."""
    # Unmatched segments (idx == -1) carry no source reference.
    index_part = f"<span>[{idx}]</span>" if idx != -1 else ""
    return f'<p style="background-color: {_source_color(idx)}; padding: 2px;">{text} {index_part}</p>'


def html_highlight(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
    source_block_size,
):
    """Run ``plagiarism_check`` and render its result as an HTML report.

    The report contains the input text with consecutive blocks from the same
    source merged and color-coded, an overall and per-category similarity
    summary, and one entry per matched source. All text taken from the input
    or from scraped pages is HTML-escaped before insertion.

    Args:
        Same as ``plagiarism_check``; all are forwarded unchanged.

    Returns:
        The HTML report as a string.
    """
    from html import escape

    start_time = time.perf_counter()
    sentence_scores, url_scores = plagiarism_check(
        plag_option,
        input,
        year_from,
        month_from,
        day_from,
        year_to,
        month_to,
        day_to,
        domains_to_skip,
        source_block_size,
    )

    # Not an f-string (the CSS braces are literal), so the font is named
    # directly rather than through a placeholder.
    parts = [
        """
        <link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>
        <div style='font-family: Roboto, sans-serif; border: 2px solid black; padding: 10px; color: #FFFFFF;'>
        <html>
        <head>
            <title>Toggle Details</title>
            <style>
                .score-container {
                    display: flex;
                    justify-content: space-around;
                    align-items: left;
                    padding: 20px;
                }
                .score-item {
                    text-align: center;
                    padding: 10px;
                    background-color: #636362;
                    border-radius: 5px;
                    flex-grow: 1;
                    margin: 0 5px;
                }
                .details {
                    display: none;
                    padding: 10px;
                }
                .url-link {
                    font-size: 1.2em;
                }
                .url-link span {
                    margin-right: 10px;
                }
                .toggle-button {
                    color: #333;
                    border: none;
                    padding: 5px 10px;
                    text-align: center;
                    text-decoration: none;
                    display: inline-block;
                    cursor: pointer;
                }
            </style>
        </head>
    """
    ]

    prev_idx = None
    combined_sentence = ""
    total_score = 0
    total_count = 0
    # NOTE(review): a set keeps only distinct scores per category, so equal
    # scores are averaged once - confirm this is intended.
    category_scores = defaultdict(set)
    for sentence, score, url, idx in sentence_scores:
        if score is not None:
            total_score += score
            category_scores[check_url_category(url)].add(score)
        total_count += 1

        # A change of source closes the segment collected so far.
        if idx != prev_idx and prev_idx is not None:
            parts.append(_highlighted_paragraph(combined_sentence, prev_idx))
            combined_sentence = ""
        combined_sentence += " " + escape(sentence)
        prev_idx = idx

    total_average_score = (
        round(total_score / total_count, 2) if total_count else 0.0
    )
    category_averages = {
        category: round((sum(scores) / len(scores)), 2)
        for category, scores in category_scores.items()
    }

    if combined_sentence:
        parts.append(_highlighted_paragraph(combined_sentence, prev_idx))

    parts.append("<hr>")

    parts.append(
        f"""
        <div class="score-container">
        <div class="score-item">
            <h3>Overall Similarity</h3>
            <p>{total_average_score}%</p>
        </div>
    """
    )
    for category, score in category_averages.items():
        parts.append(
            f"""
            <div class="score-item"><h3>{escape(category)}</h3><p>{score}%</p></div>
        """
        )
    parts.append("</div>")

    for url, score, idx, sentence in url_scores:
        url_category = escape(check_url_category(url))
        color = _source_color(idx)
        parts.append(
            f"""
            <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{escape(url)}</b></p><p><i>{url_category}</i></p>
            <p> --- <b>Matching Score: </b>{score}%</p>
            <p> --- <b>Original Source Content: </b>{escape(sentence)}</p>
        """
        )

    parts.append("</html>")

    print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)

    return "".join(parts)