aliasgerovs committed
Commit 24bfeaf · 2 Parent(s): 79b97e2 00732d6

Merge branch 'main' into demo

Files changed (6):
  1. analysis.py +172 -72
  2. app.py +73 -12
  3. explainability.py +0 -119
  4. plagiarism.py +141 -92
  5. requirements.txt +4 -1
  6. writing_analysis.py +138 -65
analysis.py CHANGED
@@ -1,31 +1,42 @@
- import requests
- import httpx
- import torch
- import re
- from bs4 import BeautifulSoup
- import numpy as np
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
- import asyncio
- from scipy.special import softmax
- from evaluate import load
- from datetime import date
- import nltk
- import fitz
- from transformers import GPT2LMHeadModel, GPT2TokenizerFast
- import nltk, spacy, subprocess, torch
- import plotly.graph_objects as go
- import torch.nn.functional as F
- import nltk
- from unidecode import unidecode
- import time
  import yaml
- import nltk
- import os
- from explainability import *
  import subprocess

  nltk.download("punkt")
  nltk.download("stopwords")
  with open("config.yaml", "r") as file:
      params = yaml.safe_load(file)
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -33,64 +44,153 @@ readability_model_id = params["READABILITY_MODEL_ID"]
  gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
  gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)

- command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
- subprocess.run(command)
- nlp = spacy.load("en_core_web_sm")


  def depth_analysis(input_text):
-     processed_words = preprocess_text1(input_text)
-     ttr_value = vocabulary_richness_ttr(processed_words)
-     gunning_fog = calculate_gunning_fog(input_text)
-     gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
-     words, sentences = preprocess_text2(input_text)
-     average_sentence_length = calculate_average_sentence_length(sentences)
-     average_word_length = calculate_average_word_length(words)
-     average_sentence_length_norm = normalize(
-         average_sentence_length, min_value=0, max_value=40
      )
-     average_word_length_norm = normalize(
-         average_word_length, min_value=0, max_value=8
      )
-     average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
-     average_tree_depth_norm = normalize(
-         average_tree_depth, min_value=0, max_value=10
      )
-     perplexity = calculate_perplexity(
-         input_text, gpt2_model, gpt2_tokenizer, device
      )
-     perplexity_norm = normalize(perplexity, min_value=0, max_value=30)

      features = {
-         "readability": gunning_fog_norm,
-         "syntactic tree depth": average_tree_depth_norm,
-         "vocabulary richness": ttr_value,
-         "perplexity": perplexity_norm,
-         "average sentence length": average_sentence_length_norm,
-         "average word length": average_word_length_norm,
      }
-     fig = go.Figure()
-     fig.add_trace(
-         go.Scatterpolar(
-             r=list(features.values()),
-             theta=list(features.keys()),
-             fill="toself",
-             name="Radar Plot",
-         )
-     )
-     fig.update_layout(
-         polar=dict(
-             radialaxis=dict(
-                 visible=True,
-                 range=[0, 100],
-             )
-         ),
-         showlegend=False,
-         margin=dict(
-             l=10,
-             r=20,
-             b=10,
-             t=10,
-         ),
      )
      return fig

  import yaml
  import subprocess
+ import nltk
+ from nltk import word_tokenize
+ from nltk.corpus import cmudict, stopwords
+ import spacy
+ import torch
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ from matplotlib.patches import Circle, RegularPolygon
+ from matplotlib.path import Path
+ from matplotlib.projections import register_projection
+ from matplotlib.projections.polar import PolarAxes
+ from matplotlib.spines import Spine
+ from matplotlib.transforms import Affine2D
+ from writing_analysis import (
+     estimated_slightly_difficult_words_ratio,
+     entity_density,
+     determiners_frequency,
+     punctuation_diversity,
+     type_token_ratio,
+     calculate_perplexity,
+     calculate_syntactic_tree_depth,
+     hapax_legomena_ratio,
+     mtld,
+ )

+ nltk.download("cmudict")
  nltk.download("punkt")
  nltk.download("stopwords")
+ nltk.download("wordnet")
+ d = cmudict.dict()
+ command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
+ subprocess.run(command)
+ nlp = spacy.load("en_core_web_sm")
+
+
  with open("config.yaml", "r") as file:
      params = yaml.safe_load(file)
  device = "cuda" if torch.cuda.is_available() else "cpu"

  gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
  gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)

+
+ def normalize(value, min_value, max_value):
+     normalized_value = ((value - min_value) * 100) / (max_value - min_value)
+     return max(0, min(100, normalized_value))


  def depth_analysis(input_text):
+
+     usual_ranges = {
+         "estimated_slightly_difficult_words_ratio": (
+             0.2273693623058005,
+             0.557383692351033,
+         ),
+         "entity_density": (-0.07940776754145815, 0.23491038179986615),
+         "determiners_frequency": (0.012461059190031154, 0.15700934579439252),
+         "punctuation_diversity": (-0.21875, 0.53125),
+         "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
+         "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
+         "calculate_syntactic_tree_depth": (1.8380681818181812, 10.997159090909092),
+         "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
+         "mtld": (-84.03125000000001, 248.81875000000002),
+     }
+
+     vocabulary_level = estimated_slightly_difficult_words_ratio(input_text, d)
+     entity_ratio = entity_density(input_text, nlp)
+     determiner_use = determiners_frequency(input_text, nlp)
+     punctuation_variety = punctuation_diversity(input_text)
+     sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
+     perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
+     lexical_diversity = type_token_ratio(input_text)
+     unique_words = hapax_legomena_ratio(input_text)
+     vocabulary_stability = mtld(input_text)
+
+     # normalize between 0 and 100
+     vocabulary_level_norm = normalize(
+         vocabulary_level, *usual_ranges["estimated_slightly_difficult_words_ratio"]
+     )
+     entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
+     determiner_use_norm = normalize(
+         determiner_use, *usual_ranges["determiners_frequency"]
      )
+     punctuation_variety_norm = normalize(
+         punctuation_variety, *usual_ranges["punctuation_diversity"]
      )
+     lexical_diversity_norm = normalize(
+         lexical_diversity, *usual_ranges["type_token_ratio"]
      )
+     unique_words_norm = normalize(unique_words, *usual_ranges["hapax_legomena_ratio"])
+     vocabulary_stability_norm = normalize(vocabulary_stability, *usual_ranges["mtld"])
+     sentence_depth_norm = normalize(
+         sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
      )
+     perplexity_norm = normalize(perplexity, *usual_ranges["calculate_perplexity"])

      features = {
+         "Lexical Diversity": lexical_diversity_norm,
+         "Vocabulary Level": vocabulary_level_norm,
+         "Unique Words": unique_words_norm,
+         "Determiner Use": determiner_use_norm,
+         "Punctuation Variety": punctuation_variety_norm,
+         "Sentence Depth": sentence_depth_norm,
+         "Vocabulary Stability": vocabulary_stability_norm,
+         "Entity Ratio": entity_ratio_norm,
+         "Perplexity": perplexity_norm,
      }
+
+     def radar_factory(num_vars, frame="circle"):
+         theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)
+
+         class RadarTransform(PolarAxes.PolarTransform):
+             def transform_path_non_affine(self, path):
+                 if path._interpolation_steps > 1:
+                     path = path.interpolated(num_vars)
+                 return Path(self.transform(path.vertices), path.codes)
+
+         class RadarAxes(PolarAxes):
+             name = "radar"
+             PolarTransform = RadarTransform
+
+             def __init__(self, *args, **kwargs):
+                 super().__init__(*args, **kwargs)
+                 self.set_theta_zero_location("N")
+
+             def fill(self, *args, closed=True, **kwargs):
+                 return super().fill(closed=closed, *args, **kwargs)
+
+             def plot(self, *args, **kwargs):
+                 lines = super().plot(*args, **kwargs)
+                 for line in lines:
+                     self._close_line(line)
+
+             def _close_line(self, line):
+                 x, y = line.get_data()
+                 if x[0] != x[-1]:
+                     x = np.append(x, x[0])
+                     y = np.append(y, y[0])
+                     line.set_data(x, y)
+
+             def set_varlabels(self, labels):
+                 self.set_thetagrids(np.degrees(theta), labels)
+
+             def _gen_axes_patch(self):
+                 if frame == "circle":
+                     return Circle((0.5, 0.5), 0.5)
+                 elif frame == "polygon":
+                     return RegularPolygon(
+                         (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
+                     )
+
+             def _gen_axes_spines(self):
+                 if frame == "polygon":
+                     spine = Spine(
+                         axes=self,
+                         spine_type="circle",
+                         path=Path.unit_regular_polygon(num_vars),
+                     )
+                     spine.set_transform(
+                         Affine2D().scale(0.5).translate(0.5, 0.5) + self.transAxes
+                     )
+                     return {"polar": spine}
+
+         register_projection(RadarAxes)
+         return theta
+
+     N = 9
+     theta = radar_factory(N, frame="polygon")
+     data = features.values()
+     labels = features.keys()
+     fig, ax = plt.subplots(subplot_kw=dict(projection="radar"), figsize=(7.5, 5))
+     ax.plot(theta, data)
+     ax.fill(theta, data, alpha=0.4)
+     ax.set_varlabels(labels)
+
+     rgrids = np.linspace(0, 100, num=6)
+     ax.set_rgrids(
+         rgrids, labels=[f"{round(r)}%" for r in rgrids], fontsize=8, color="black"
      )
+     ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)
+
+     for dd, (label, value) in enumerate(zip(labels, data)):
+         ax.text(
+             theta[dd] + 0.1,
+             value + 5,
+             f"{value:.0f}",
+             horizontalalignment="left",
+             verticalalignment="bottom",
+             fontsize=8,
+         )
+
      return fig
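
Note: every metric above is squashed onto the same 0-100 radar scale by the new normalize helper, using the hard-coded "usual ranges" as min/max and clamping anything outside them. A quick illustration of that arithmetic (the input values below are made up for the example; only the range constants come from the diff):

    def normalize(value, min_value, max_value):
        normalized_value = ((value - min_value) * 100) / (max_value - min_value)
        return max(0, min(100, normalized_value))

    # A perplexity of 30 against the usual range (-25.11, 82.46) lands mid-scale:
    print(normalize(30, -25.110544681549072, 82.4620680809021))  # ~51.2
    # An MTLD far above its usual range is clamped to the top of the radar axis:
    print(normalize(300, -84.03125000000001, 248.81875000000002))  # 100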
app.py CHANGED
@@ -5,7 +5,7 @@ from predictors import predict_bc_scores, predict_mc_scores
  from predictors import update, correct_text, split_text
  from analysis import depth_analysis
  from predictors import predict_quillbot
- from plagiarism import plagiarism_check, build_date
  from highlighter import analyze_and_highlight
  from utils import extract_text_from_pdf, len_validator
  import yaml
@@ -21,7 +21,9 @@ model_list = params["MC_OUTPUT_LABELS"]


  analyze_and_highlight_bc = partial(analyze_and_highlight, model_type="bc")
- analyze_and_highlight_quillbot = partial(analyze_and_highlight, model_type="quillbot")


  def ai_generated_test(option, input, models):
@@ -47,7 +49,18 @@ def main(
      domains_to_skip,
  ):

-     formatted_tokens = plagiarism_check(
          plag_option,
          input,
          year_from,
@@ -218,20 +231,67 @@ with gr.Blocks() as demo:

      with gr.Row():
          with gr.Column():
-             sentenceBreakdown = gr.HighlightedText(
                  label="Source Detection Sentence Breakdown",
-                 combine_adjacent=True,
-                 color_map={
-                     "[1]": "red",
-                     "[2]": "orange",
-                     "[3]": "yellow",
-                     "[4]": "green",
-                 },
              )

      with gr.Row():
          with gr.Column():
              writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")

      full_check_btn.click(
          fn=main,
@@ -275,7 +335,8 @@ with gr.Blocks() as demo:
      )

      only_plagiarism_btn.click(
-         fn=plagiarism_check,
          inputs=[
              plag_option,
              input_text,

  from predictors import update, correct_text, split_text
  from analysis import depth_analysis
  from predictors import predict_quillbot
+ from plagiarism import plagiarism_check, build_date, html_highlight
  from highlighter import analyze_and_highlight
  from utils import extract_text_from_pdf, len_validator
  import yaml


  analyze_and_highlight_bc = partial(analyze_and_highlight, model_type="bc")
+ analyze_and_highlight_quillbot = partial(
+     analyze_and_highlight, model_type="quillbot"
+ )


  def ai_generated_test(option, input, models):

      domains_to_skip,
  ):

+     # formatted_tokens = plagiarism_check(
+     # plag_option,
+     # input,
+     # year_from,
+     # month_from,
+     # day_from,
+     # year_to,
+     # month_to,
+     # day_to,
+     # domains_to_skip,
+     # )
+     formatted_tokens = html_highlight(
          plag_option,
          input,
          year_from,

      with gr.Row():
          with gr.Column():
+             sentenceBreakdown = gr.HTML(
                  label="Source Detection Sentence Breakdown",
+                 value="Source Detection Sentence Breakdown",
              )

      with gr.Row():
          with gr.Column():
              writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")
+         with gr.Column():
+             interpretation = """
+             <h2>Writing Analysis Interpretation</h2>
+             <ul>
+                 <li><b>Lexical Diversity</b>: This feature measures the range of unique words used in a text.
+                     <ul>
+                         <li>🤖 Higher tends to be AI.</li>
+                     </ul>
+                 </li>
+                 <li><b>Vocabulary Level</b>: This feature assesses the complexity of the words used in a text.
+                     <ul>
+                         <li>🤖 Higher tends to be AI.</li>
+                     </ul>
+                 </li>
+                 <li><b>Unique Words</b>: This feature counts the number of words that appear only once within the text.
+                     <ul>
+                         <li>🤖 Higher tends to be AI.</li>
+                     </ul>
+                 </li>
+                 <li><b>Determiner Use</b>: This feature tracks the frequency of articles and quantifiers in the text.
+                     <ul>
+                         <li>🤖 Higher tends to be AI.</li>
+                     </ul>
+                 </li>
+                 <li><b>Punctuation Variety</b>: This feature indicates the diversity of punctuation marks used in the text.
+                     <ul>
+                         <li>👤 Higher tends to be Human.</li>
+                     </ul>
+                 </li>
+                 <li><b>Sentence Depth</b>: This feature evaluates the complexity of the sentence structures used in the text.
+                     <ul>
+                         <li>🤖 Higher tends to be AI.</li>
+                     </ul>
+                 </li>
+                 <li><b>Vocabulary Stability</b>: This feature measures the consistency of vocabulary use throughout the text.
+                     <ul>
+                         <li>🤖 Higher tends to be AI.</li>
+                     </ul>
+                 </li>
+                 <li><b>Entity Ratio</b>: This feature calculates the proportion of named entities, such as names and places, within the text.
+                     <ul>
+                         <li>👤 Higher tends to be Human.</li>
+                     </ul>
+                 </li>
+                 <li><b>Perplexity</b>: This feature assesses the predictability of the text based on the sequence of words.
+                     <ul>
+                         <li>👤 Higher tends to be Human.</li>
+                     </ul>
+                 </li>
+             </ul>
+
+             """
+             gr.HTML(interpretation, label="Interpretation of Writing Analysis")

      full_check_btn.click(
          fn=main,

      )

      only_plagiarism_btn.click(
+         # fn=plagiarism_check,
+         fn=html_highlight,
          inputs=[
              plag_option,
              input_text,
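
Note: on the UI side the diff swaps gr.HighlightedText for gr.HTML and points only_plagiarism_btn at html_highlight, which now returns a ready-made HTML string. A minimal, self-contained sketch of that wiring pattern, with a stand-in function because the real html_highlight needs API keys and scraping (the component and function names below are illustrative, not the app's actual layout):

    import gradio as gr

    def fake_highlight(text):
        # Stand-in for html_highlight: any function that returns an HTML string
        # can feed a gr.HTML component.
        return f"<div style='border: 2px solid black; padding: 10px;'><p>{text} <span>[1]</span></p></div>"

    with gr.Blocks() as demo:
        input_text = gr.Textbox(label="Input text")
        check_btn = gr.Button("Source Check")
        breakdown = gr.HTML(label="Source Detection Sentence Breakdown")
        check_btn.click(fn=fake_highlight, inputs=[input_text], outputs=[breakdown])

    if __name__ == "__main__":
        demo.launch()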
explainability.py DELETED
@@ -1,119 +0,0 @@
- import re, textstat
- from nltk import FreqDist
- from nltk.corpus import stopwords
- from nltk.tokenize import word_tokenize, sent_tokenize
- import torch
- import nltk
- from tqdm import tqdm
-
- nltk.download("punkt")
-
-
- def normalize(value, min_value, max_value):
-     normalized_value = ((value - min_value) * 100) / (max_value - min_value)
-     return max(0, min(100, normalized_value))
-
-
- def preprocess_text1(text):
-     text = text.lower()
-     text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
-     stop_words = set(stopwords.words("english"))  # remove stopwords
-     words = [word for word in text.split() if word not in stop_words]
-     words = [word for word in words if not word.isdigit()]  # remove numbers
-     return words
-
-
- def vocabulary_richness_ttr(words):
-     unique_words = set(words)
-     ttr = len(unique_words) / len(words) * 100
-     return ttr
-
-
- def calculate_gunning_fog(text):
-     """range 0-20"""
-     gunning_fog = textstat.gunning_fog(text)
-     return gunning_fog
-
-
- def calculate_automated_readability_index(text):
-     """range 1-20"""
-     ari = textstat.automated_readability_index(text)
-     return ari
-
-
- def calculate_flesch_reading_ease(text):
-     """range 0-100"""
-     fre = textstat.flesch_reading_ease(text)
-     return fre
-
-
- def preprocess_text2(text):
-     sentences = sent_tokenize(text)
-     words = [
-         word.lower()
-         for sent in sentences
-         for word in word_tokenize(sent)
-         if word.isalnum()
-     ]
-     stop_words = set(stopwords.words("english"))
-     words = [word for word in words if word not in stop_words]
-     return words, sentences
-
-
- def calculate_average_sentence_length(sentences):
-     """range 0-40 or 50 based on the histogram"""
-     total_words = sum(len(word_tokenize(sent)) for sent in sentences)
-     average_sentence_length = total_words / (len(sentences) + 0.0000001)
-     return average_sentence_length
-
-
- def calculate_average_word_length(words):
-     """range 0-8 based on the histogram"""
-     total_characters = sum(len(word) for word in words)
-     average_word_length = total_characters / (len(words) + 0.0000001)
-     return average_word_length
-
-
- def calculate_max_depth(sent):
-     return max(len(list(token.ancestors)) for token in sent)
-
-
- def calculate_syntactic_tree_depth(nlp, text):
-     """0-10 based on the histogram"""
-     doc = nlp(text)
-     sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
-     average_depth = (
-         sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
-     )
-     return average_depth
-
-
- def calculate_perplexity(text, model, tokenizer, device, stride=512):
-     """range 0-30 based on the histogram"""
-     encodings = tokenizer(text, return_tensors="pt")
-     max_length = model.config.n_positions
-     seq_len = encodings.input_ids.size(1)
-
-     nlls = []
-     prev_end_loc = 0
-     for begin_loc in tqdm(range(0, seq_len, stride)):
-         end_loc = min(begin_loc + max_length, seq_len)
-         trg_len = (
-             end_loc - prev_end_loc
-         )  # may be different from stride on last loop
-         input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
-         target_ids = input_ids.clone()
-         target_ids[:, :-trg_len] = -100
-
-         with torch.no_grad():
-             outputs = model(input_ids, labels=target_ids)
-             neg_log_likelihood = outputs.loss
-
-         nlls.append(neg_log_likelihood)
-
-         prev_end_loc = end_loc
-         if end_loc == seq_len:
-             break
-
-     ppl = torch.exp(torch.stack(nlls).mean())
-     return ppl.item()
plagiarism.py CHANGED
@@ -19,7 +19,6 @@ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
  # returns cosine similarity of two vectors
  # input: two vectors
  # output: integer between 0 and 1.
-
  def get_cosine(vec1, vec2):
      intersection = set(vec1.keys()) & set(vec2.keys())

@@ -75,9 +74,9 @@ def sentence_similarity(text1, text2):
  def google_search(
      plag_option,
      sentences,
-     urlCount,
-     scoreArray,
-     urlList,
      sorted_date,
      domains_to_skip,
      api_key,
@@ -112,30 +111,30 @@

              # update cosine similarity between snippet and given text
              url = link["link"]
-             if url not in urlList:
-                 urlList.append(url)
-                 scoreArray.append([0] * len(sentences))
-             urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
              if plag_option == "Standard":
-                 scoreArray[urlList.index(url)][i] = cosineSim(
                      sentence, snippet
                  )
              else:
-                 scoreArray[urlList.index(url)][i] = sentence_similarity(
                      sentence, snippet
                  )
-     return urlCount, scoreArray


  def split_sentence_blocks(text):
-
-     sents = sent_tokenize(text)
      two_sents = []
-     for i in range(len(sents)):
-         if (i % 2) == 0:
-             two_sents.append(sents[i])
-         else:
-             two_sents[len(two_sents) - 1] += " " + sents[i]
      return two_sents


@@ -191,7 +190,6 @@ async def parallel_scrap(urls):
      return results


-
  def matching_score(sentence_content_tuple):
      sentence, content = sentence_content_tuple
      if sentence in content:
@@ -204,11 +202,99 @@ def matching_score(sentence_content_tuple):
      matched = [x for x in ngrams if " ".join(x) in content]
      return len(matched) / len(ngrams)

  def process_with_multiprocessing(input_data):
      with Pool(processes=4) as pool:
          scores = pool.map(matching_score, input_data)
      return scores
-
  def plagiarism_check(
      plag_option,
      input,
@@ -222,116 +308,79 @@
  ):
      api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
      api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
-     api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
      # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
-     api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
      cse_id = "851813e81162b4ed4"

      sentences = split_sentence_blocks(input)
-     urlCount = {}
-     ScoreArray = []
-     urlList = []
      date_from = build_date(year_from, month_from, day_from)
      date_to = build_date(year_to, month_to, day_to)
      sort_date = f"date:r:{date_from}:{date_to}"
      # get list of URLS to check
-     urlCount, ScoreArray = google_search(
          plag_option,
          sentences,
-         urlCount,
-         ScoreArray,
-         urlList,
          sort_date,
          domains_to_skip,
          api_key,
          cse_id,
      )
-
      # Scrape URLs in list
-     formatted_tokens = []
-     soups = asyncio.run(parallel_scrap(urlList))
-
-     # # Populate matching scores for scrapped pages
-     # for i, soup in enumerate(soups):
-     # print(f"Analyzing {i+1} of {len(soups)} soups........................")
-     # if soup:
-     # page_content = soup.text
-
-     # for j, sent in enumerate(sentences):
-     # args_list = (sent, page_content)
-     # score = matching_score(args_list)
-     # # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-     # ScoreArray[i][j] = score
-
      input_data = []
      for i, soup in enumerate(soups):
          if soup:
              page_content = soup.text
              for j, sent in enumerate(sentences):
                  input_data.append((sent, page_content))
-
      scores = process_with_multiprocessing(input_data)
-     k = 0
      for i, soup in enumerate(soups):
          if soup:
              for j, _ in enumerate(sentences):
-                 ScoreArray[i][j] = scores[k]
-                 k += 1
-
-     sentenceToMaxURL = [-1] * len(sentences)
-
-     for j in range(len(sentences)):
-         if j > 0:
-             maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
-             sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
-         else:
-             maxScore = -1
-
-         for i in range(len(ScoreArray)):
-             margin = (
-                 0.1
-                 if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
-                 else 0
-             )
-             if ScoreArray[i][j] - maxScore > margin:
-                 maxScore = ScoreArray[i][j]
-                 sentenceToMaxURL[j] = i

      index = np.unique(sentenceToMaxURL)

-     urlScore = {}
      for url in index:
          s = [
-             ScoreArray[url][sen]
              for sen in range(len(sentences))
              if sentenceToMaxURL[sen] == url
          ]
-         urlScore[url] = sum(s) / len(s)
-
-     index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
-
      urlMap = {}
      for count, i in enumerate(index_descending):
          urlMap[i] = count + 1
-
      for i, sent in enumerate(sentences):
-         formatted_tokens.append(
-             (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
-         )
-
-     formatted_tokens.append(("\n", None))
-     formatted_tokens.append(("\n", None))
-     formatted_tokens.append(("\n", None))
-
      for ind in index_descending:
-         formatted_tokens.append(
-             (
-                 urlList[ind]
-                 + " --- Matching Score: "
-                 + f"{str(round(urlScore[ind] * 100, 2))}%",
-                 "[" + str(urlMap[ind]) + "]",
              )
-         )
-     formatted_tokens.append(("\n", None))

-     return formatted_tokens

  # returns cosine similarity of two vectors
  # input: two vectors
  # output: integer between 0 and 1.
  def get_cosine(vec1, vec2):
      intersection = set(vec1.keys()) & set(vec2.keys())


  def google_search(
      plag_option,
      sentences,
+     url_count,
+     score_array,
+     url_list,
      sorted_date,
      domains_to_skip,
      api_key,

              # update cosine similarity between snippet and given text
              url = link["link"]
+             if url not in url_list:
+                 url_list.append(url)
+                 score_array.append([0] * len(sentences))
+             url_count[url] = url_count[url] + 1 if url in url_count else 1
              if plag_option == "Standard":
+                 score_array[url_list.index(url)][i] = cosineSim(
                      sentence, snippet
                  )
              else:
+                 score_array[url_list.index(url)][i] = sentence_similarity(
                      sentence, snippet
                  )
+     return url_count, score_array


  def split_sentence_blocks(text):
      two_sents = []
+     for para in text.split("\n\n"):
+         sents = sent_tokenize(para)
+         for i in range(len(sents)):
+             if (i % 2) == 0:
+                 two_sents.append(sents[i])
+             else:
+                 two_sents[len(two_sents) - 1] += " " + sents[i]
      return two_sents


      return results


  def matching_score(sentence_content_tuple):
      sentence, content = sentence_content_tuple
      if sentence in content:

      matched = [x for x in ngrams if " ".join(x) in content]
      return len(matched) / len(ngrams)

+
  def process_with_multiprocessing(input_data):
      with Pool(processes=4) as pool:
          scores = pool.map(matching_score, input_data)
      return scores
+
+
+ def print2d(array):
+     for row in array:
+         print(row)
+
+
+ def map_sentence_url(sentences, score_array):
+     sentenceToMaxURL = [-1] * len(sentences)
+     for j in range(len(sentences)):
+         if j > 0:
+             maxScore = score_array[sentenceToMaxURL[j - 1]][j]
+             sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
+         else:
+             maxScore = -1
+         for i in range(len(score_array)):
+             margin = (
+                 0.05
+                 if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
+                 else 0
+             )
+             if score_array[i][j] - maxScore > margin:
+                 maxScore = score_array[i][j]
+                 sentenceToMaxURL[j] = i
+     return sentenceToMaxURL
+
+
+ def html_highlight(
+     plag_option,
+     input,
+     year_from,
+     month_from,
+     day_from,
+     year_to,
+     month_to,
+     day_to,
+     domains_to_skip,
+ ):
+     sentence_scores, url_scores = plagiarism_check(
+         plag_option,
+         input,
+         year_from,
+         month_from,
+         day_from,
+         year_to,
+         month_to,
+         day_to,
+         domains_to_skip,
+     )
+     color_map = [
+         "#cf2323",
+         "#eb9d59",
+         "#c2ad36",
+         "#e1ed72",
+         "#c2db76",
+         "#a2db76",
+     ]
+     font = "Roboto"
+     html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
+     prev_idx = None
+     combined_sentence = ""
+     for sentence, _, _, idx in sentence_scores:
+         if idx != prev_idx and prev_idx is not None:
+             color = color_map[prev_idx - 1]
+             index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
+             formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
+             html_content += formatted_sentence
+             combined_sentence = ""
+         combined_sentence += " " + sentence
+         prev_idx = idx
+
+     if combined_sentence:
+         color = color_map[prev_idx - 1]
+         index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
+         formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
+         html_content += formatted_sentence
+
+     html_content += "<hr>"
+     for url, score, idx in url_scores:
+         color = color_map[idx - 1]
+         formatted_url = f'<p style="background-color: {color}; padding: 5px;">({idx}) <b>{url}</b></p><p> --- Matching Score: {score}%</p>'
+         html_content += formatted_url
+
+     html_content += "</div>"
+
+     return html_content
+
+
  def plagiarism_check(
      plag_option,
      input,

  ):
      api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
      api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
+     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
      # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
      cse_id = "851813e81162b4ed4"

+     url_scores = []
+     sentence_scores = []
      sentences = split_sentence_blocks(input)
+     url_count = {}
+     score_array = []
+     url_list = []
      date_from = build_date(year_from, month_from, day_from)
      date_to = build_date(year_to, month_to, day_to)
      sort_date = f"date:r:{date_from}:{date_to}"
      # get list of URLS to check
+     url_count, score_array = google_search(
          plag_option,
          sentences,
+         url_count,
+         score_array,
+         url_list,
          sort_date,
          domains_to_skip,
          api_key,
          cse_id,
      )
      # Scrape URLs in list
+     soups = asyncio.run(parallel_scrap(url_list))
      input_data = []
      for i, soup in enumerate(soups):
          if soup:
              page_content = soup.text
              for j, sent in enumerate(sentences):
                  input_data.append((sent, page_content))
      scores = process_with_multiprocessing(input_data)
+
+     k = 0
+     # Update score array for each (soup, sentence)
      for i, soup in enumerate(soups):
          if soup:
              for j, _ in enumerate(sentences):
+                 score_array[i][j] = scores[k]
+                 k += 1

+     sentenceToMaxURL = map_sentence_url(sentences, score_array)
      index = np.unique(sentenceToMaxURL)

+     url_source = {}
      for url in index:
          s = [
+             score_array[url][sen]
              for sen in range(len(sentences))
              if sentenceToMaxURL[sen] == url
          ]
+         url_source[url] = sum(s) / len(s)
+     index_descending = sorted(url_source, key=url_source.get, reverse=True)
      urlMap = {}
      for count, i in enumerate(index_descending):
          urlMap[i] = count + 1
+
+     # build results
      for i, sent in enumerate(sentences):
+         ind = sentenceToMaxURL[i]
+         if url_source[ind] > 0.1:
+             sentence_scores.append(
+                 [sent, url_source[ind], url_list[ind], urlMap[ind]]
+             )
+         else:
+             sentence_scores.append([sent, None, url_list[ind], -1])
      for ind in index_descending:
+         if url_source[ind] > 0.1:
+             url_scores.append(
+                 [url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
              )

+     return sentence_scores, url_scores
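
Note: map_sentence_url assigns each sentence block to the best-scoring URL, but once a sentence has inherited the previous sentence's URL, another URL must beat that score by more than the 0.05 margin (down from 0.1 in the old inline version) to take over, which keeps runs of consecutive sentences attributed to one source. A small worked example with invented scores:

    # Rows are candidate URLs, columns are sentence blocks (scores invented for illustration).
    score_array = [
        [0.90, 0.52, 0.10],  # url 0
        [0.20, 0.55, 0.80],  # url 1
    ]
    sentences = ["s1", "s2", "s3"]

    # Same selection logic as map_sentence_url in the diff above, condensed.
    def map_sentence_url(sentences, score_array):
        sentenceToMaxURL = [-1] * len(sentences)
        for j in range(len(sentences)):
            if j > 0:
                maxScore = score_array[sentenceToMaxURL[j - 1]][j]
                sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
            else:
                maxScore = -1
            for i in range(len(score_array)):
                margin = 0.05 if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1]) else 0
                if score_array[i][j] - maxScore > margin:
                    maxScore = score_array[i][j]
                    sentenceToMaxURL[j] = i
        return sentenceToMaxURL

    print(map_sentence_url(sentences, score_array))
    # -> [0, 0, 1]: sentence 2 sticks with url 0 because 0.55 beats 0.52 by less
    #    than the 0.05 margin, while sentence 3's 0.80 clears it easily.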
requirements.txt CHANGED
@@ -28,4 +28,7 @@ lime
  joblib
  optimum
  clean-text
- optimum[onnxruntime]

  joblib
  optimum
  clean-text
+ optimum[onnxruntime]
+ emoji==1.6.1
+ matplotlib
+ seaborn
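
Note: the emoji pin is deliberate; get_special_chars() in writing_analysis.py reads emoji.UNICODE_EMOJI["en"], which later emoji releases removed, hence the in-code comment "Use version emoji==1.6.1". A quick sanity check under that assumption:

    # Assumes emoji==1.6.1 is installed, as pinned above; on emoji>=2.0 this raises AttributeError.
    import emoji

    emoji_chars = list(emoji.UNICODE_EMOJI["en"].keys())
    print(len(emoji_chars) > 0)  # True: the keys are the emoji characters themselves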
writing_analysis.py CHANGED
@@ -1,85 +1,153 @@
- import re, textstat
- from nltk import FreqDist
  from nltk.corpus import stopwords
- from nltk.tokenize import word_tokenize, sent_tokenize
  import torch
- from tqdm import tqdm
-
-
- def normalize(value, min_value, max_value):
-     normalized_value = ((value - min_value) * 100) / (max_value - min_value)
-     return max(0, min(100, normalized_value))
-
- # vocabulary richness
- def preprocess_text1(text):
-     text = text.lower()
-     text = re.sub(r'[^\w\s]', '', text) # remove punctuation
-     stop_words = set(stopwords.words('english')) # remove stopwords
-     words = [word for word in text.split() if word not in stop_words]
-     words = [word for word in words if not word.isdigit()] # remove numbers
-     return words
-
- def vocabulary_richness_ttr(words):
-     unique_words = set(words)
-     ttr = len(unique_words) / len(words) * 100
-     return ttr
-
- def calculate_gunning_fog(text):
-     """range 0-20"""
-     gunning_fog = textstat.gunning_fog(text)
-     return gunning_fog
-
- def calculate_automated_readability_index(text):
-     """range 1-20"""
-     ari = textstat.automated_readability_index(text)
-     return ari
-
- def calculate_flesch_reading_ease(text):
-     """range 0-100"""
-     fre = textstat.flesch_reading_ease(text)
-     return fre
-
- def preprocess_text2(text):
-     # tokenize into words and remove punctuation
-     sentences = sent_tokenize(text)
-     words = [word.lower() for sent in sentences for word in word_tokenize(sent) if word.isalnum()]
-     # remove stopwords
-     stop_words = set(stopwords.words('english'))
-     words = [word for word in words if word not in stop_words]
-     return words, sentences
-
- def calculate_average_sentence_length(sentences):
-     """range 0-40 or 50 based on the histogram"""
-     total_words = sum(len(word_tokenize(sent)) for sent in sentences)
-     average_sentence_length = total_words / (len(sentences) + 0.0000001)
-     return average_sentence_length
-
- def calculate_average_word_length(words):
-     """range 0-8 based on the histogram"""
-     total_characters = sum(len(word) for word in words)
-     average_word_length = total_characters / (len(words) + 0.0000001)
-     return average_word_length

  def calculate_max_depth(sent):
      return max(len(list(token.ancestors)) for token in sent)

- def calculate_syntactic_tree_depth(nlp, text):
-     """0-10 based on the histogram"""
      doc = nlp(text)
      sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
-     average_depth = sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
      return average_depth

- # reference: https://huggingface.co/docs/transformers/perplexity
  def calculate_perplexity(text, model, tokenizer, device, stride=512):
-     """range 0-30 based on the histogram"""
      encodings = tokenizer(text, return_tensors="pt")
      max_length = model.config.n_positions
      seq_len = encodings.input_ids.size(1)

      nlls = []
      prev_end_loc = 0
-     for begin_loc in tqdm(range(0, seq_len, stride)):
          end_loc = min(begin_loc + max_length, seq_len)
          trg_len = end_loc - prev_end_loc # may be different from stride on last loop
          input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
@@ -88,6 +156,10 @@ def calculate_perplexity(text, model, tokenizer, device, stride=512):

          with torch.no_grad():
              outputs = model(input_ids, labels=target_ids)
              neg_log_likelihood = outputs.loss

          nlls.append(neg_log_likelihood)
@@ -98,3 +170,4 @@ def calculate_perplexity(text, model, tokenizer, device, stride=512):

      ppl = torch.exp(torch.stack(nlls).mean())
      return ppl.item()

+ import string
+ from collections import Counter
+ from nltk import word_tokenize
  from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from nltk.probability import FreqDist
  import torch

+
+ def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
+     tokens = word_tokenize(text.lower())
+     tokens = [token for token in tokens if token.isalpha()]
+     if remove_stopwords:
+         stop_words = set(stopwords.words("english"))
+         tokens = [token for token in tokens if token not in stop_words]
+     if use_lemmatization:
+         lemmatizer = WordNetLemmatizer()
+         tokens = [lemmatizer.lemmatize(token) for token in tokens]
+     return tokens
+
+
+ def get_special_chars():
+     import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI
+
+     main_special_characters = string.punctuation + string.digits + string.whitespace
+     other_special_characters = (
+         "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
+         "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
+         "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
+         "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
+         "」﴾》"
+     )
+     emoji = list(emoji.UNICODE_EMOJI["en"].keys())
+     special_characters_default = set(main_special_characters + other_special_characters)
+     special_characters_default.update(emoji)
+     return special_characters_default
+
+ special_characters_default = get_special_chars()
+
+
+ # -------------------- Features --------------------
+ def syllable_count(word, d):
+     return [len(list(y for y in x if y[-1].isdigit())) for x in d.get(word, [])]
+
+
+ def estimated_slightly_difficult_words_ratio(text, d):
+     words = word_tokenize(text.lower())
+     total_words = len(words)
+     # Considering words with 3 or more syllables as difficult
+     difficult_count = sum(
+         1 for word in words if sum(1 for _ in syllable_count(word, d)) >= 2
+     )
+     return difficult_count / total_words if total_words > 0 else 0
+
+
+ # -------------------- Features --------------------
+ def entity_density(text, nlp):
+     doc = nlp(text)
+     return len(doc.ents) / len(doc)
+
+
+ # -------------------- Features --------------------
+ def determiners_frequency(text, nlp):
+     doc = nlp(text)
+     determiners = sum(1 for token in doc if token.pos_ == "DET")
+     total_words = len(doc)
+     return determiners / total_words if total_words else 0
+
+
+ # -------------------- Features --------------------
+ def punctuation_diversity(text):
+     punctuation_counts = Counter(
+         char for char in text if char in special_characters_default
+     )
+     diversity_score = (
+         len(punctuation_counts) / len(special_characters_default)
+         if special_characters_default
+         else 0
+     )
+     return diversity_score
+
+
+ # -------------------- Features --------------------
+ def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
+     tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
+     unique_words = set(tokens)
+     return len(unique_words) / len(tokens) if tokens else 0
+
+
+ # -------------------- Features --------------------
+ def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
+     tokens = word_tokenize(text.lower())
+     tokens = [token for token in tokens if token.isalpha()]
+
+     if remove_stopwords:
+         stop_words = set(stopwords.words("english"))
+         tokens = [token for token in tokens if token not in stop_words]
+
+     if use_lemmatization:
+         lemmatizer = WordNetLemmatizer()
+         tokens = [lemmatizer.lemmatize(token) for token in tokens]
+
+     freq_dist = FreqDist(tokens)
+     hapaxes = freq_dist.hapaxes()
+     return len(hapaxes) / len(tokens) if tokens else 0
+
+
+ # -------------------- Features --------------------
+ def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
+     tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
+
+     def mtld_calc(direction):
+         token_length, factor_count = 0, 0
+         types = set()
+         for token in tokens if direction == "forward" else reversed(tokens):
+             types.add(token)
+             token_length += 1
+             if len(types) / token_length < threshold:
+                 factor_count += 1
+                 types = set()
+                 token_length = 0
+         factor_count += 1  # For the last segment, even if it didn't reach the threshold
+         return len(tokens) / factor_count if factor_count != 0 else 0
+
+     return (mtld_calc("forward") + mtld_calc("backward")) / 2
+
+
+ # -------------------- Features --------------------
  def calculate_max_depth(sent):
      return max(len(list(token.ancestors)) for token in sent)

+
+ def calculate_syntactic_tree_depth(text, nlp):
      doc = nlp(text)
      sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
+     average_depth = (
+         sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
+     )
      return average_depth

+
+ # -------------------- Features --------------------
  def calculate_perplexity(text, model, tokenizer, device, stride=512):
      encodings = tokenizer(text, return_tensors="pt")
      max_length = model.config.n_positions
      seq_len = encodings.input_ids.size(1)

      nlls = []
      prev_end_loc = 0
+     for begin_loc in range(0, seq_len, stride):
          end_loc = min(begin_loc + max_length, seq_len)
          trg_len = end_loc - prev_end_loc # may be different from stride on last loop
          input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)

          with torch.no_grad():
              outputs = model(input_ids, labels=target_ids)
+
+             # loss is calculated using CrossEntropyLoss which averages over valid labels
+             # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
+             # to the left by 1.
              neg_log_likelihood = outputs.loss

          nlls.append(neg_log_likelihood)

      ppl = torch.exp(torch.stack(nlls).mean())
      return ppl.item()
+
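
Note: the new lexical features share preprocess_text (lowercasing, alphabetic tokens, optional stopword removal and lemmatization), so the pure-NLTK ones can be tried in isolation. A minimal usage sketch, assuming the NLTK data is downloaded the same way analysis.py does it and emoji==1.6.1 is installed (writing_analysis builds its punctuation set at import time):

    import nltk

    for pkg in ("punkt", "stopwords", "wordnet"):
        nltk.download(pkg)

    from writing_analysis import type_token_ratio, hapax_legomena_ratio, mtld

    sample = (
        "The quick brown fox jumps over the lazy dog. "
        "The dog barks, and the fox runs away into the quiet woods."
    )

    print(type_token_ratio(sample))      # unique tokens / total tokens, in [0, 1]
    print(hapax_legomena_ratio(sample))  # share of tokens appearing exactly once
    print(mtld(sample))                  # higher means the vocabulary stays varied for longer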