Commit ea6afa4 • BheemaShankerNeyigapula committed • 1 Parent(s): 00dafdf
Upload folder using huggingface_hub
Files changed:
- .gitignore +2 -0
- .gradio/certificate.pem +31 -0
- README.md +4 -8
- app.py +169 -0
- detectability.py +303 -0
- distortion.py +126 -0
- entailment.py +33 -0
- euclidean_distance.py +74 -0
- gpt_mask_filling.py +70 -0
- highlighter.py +92 -0
- lcs.py +63 -0
- masking_methods.py +137 -0
- masking_methods_trial.py +188 -0
- paraphraser.py +45 -0
- requirements.txt +21 -0
- sampling_methods.py +35 -0
- scores.py +51 -0
- threeD_plot.py +69 -0
- tree.py +240 -0
- vocabulary_split.py +56 -0
- watermark_detector.py +75 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+.env
+__pycache__/
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,8 @@
 ---
-title:
-emoji: 🚀
-colorFrom: blue
-colorTo: gray
-sdk: gradio
-sdk_version: 5.4.0
+title: aiisc-watermarking-model
 app_file: app.py
-
+sdk: gradio
+sdk_version: 4.36.0
 ---
 
-
+Clone the repository and ``cd`` into it. Run ``gradio app.py`` to start the server.
app.py ADDED
@@ -0,0 +1,169 @@
+import nltk
+nltk.download('stopwords')
+import plotly.graph_objs as go
+from transformers import pipeline
+import random
+import gradio as gr
+from tree import generate_subplot1, generate_subplot2
+from paraphraser import generate_paraphrase
+from lcs import find_common_subsequences, find_common_gram_positions
+from highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
+from entailment import analyze_entailment
+from masking_methods import mask_non_stopword, high_entropy_words
+from sampling_methods import sample_word
+from detectability import SentenceDetectabilityCalculator
+from distortion import SentenceDistortionCalculator
+from euclidean_distance import SentenceEuclideanDistanceCalculator
+from threeD_plot import gen_three_D_plot
+
+
+# Function for the Gradio interface
+def model(prompt):
+    user_prompt = prompt
+    paraphrased_sentences = generate_paraphrase(user_prompt)
+    analyzed_paraphrased_sentences, selected_sentences, discarded_sentences = analyze_entailment(user_prompt, paraphrased_sentences, 0.7)
+
+    common_grams = find_common_subsequences(user_prompt, selected_sentences)
+    # find_common_gram_positions unpacks (index, gram) tuples, so pass the
+    # tuples themselves rather than the bare subsequence strings
+    common_grams_position = find_common_gram_positions(selected_sentences, common_grams)
+
+    # Queue up (masking function, sentence, extra args) triples in a single loop
+    masked_results = []
+    for sentence in paraphrased_sentences:
+        masked_results.extend([
+            (mask_non_stopword, sentence),
+            (mask_non_stopword, sentence, True),
+            (high_entropy_words, sentence, common_grams)
+        ])
+
+    # Apply each masking function once, forwarding its extra arguments (if any)
+    masked_outputs = [func(sent, *extra) for func, sent, *extra in masked_results]
+
+    # Unpack masked outputs into separate lists
+    masked_sentences, masked_words, masked_logits = zip(*masked_outputs) if masked_outputs else ([], [], [])
+
+    sampled_sentences = []
+    for masked_sent, words, logits in zip(masked_sentences, masked_words, masked_logits):
+        for technique in ['inverse_transform', 'exponential_minimum', 'temperature', 'greedy']:
+            sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique=technique, temperature=1.0))
+
+    colors = ["red", "blue", "brown", "green"]
+
+    def select_color():
+        return random.choice(colors)
+
+    highlight_info = [(word, select_color()) for _, word in common_grams]
+
+    highlighted_user_prompt = highlight_common_words(common_grams, [user_prompt], "Non-melting Points in the User Prompt")
+    highlighted_accepted_sentences = highlight_common_words_dict(common_grams, selected_sentences, "Paraphrased Sentences")
+    highlighted_discarded_sentences = highlight_common_words_dict(common_grams, discarded_sentences, "Discarded Sentences")
+
+    trees1, trees2 = [], []
+
+    for i, sentence in enumerate(paraphrased_sentences):
+        # Each paraphrase yields 3 masked variants and 3 * 4 = 12 sampled variants
+        next_masked_sentences = masked_sentences[i * 3:(i + 1) * 3]
+        next_sampled_sentences = sampled_sentences[i * 12:(i + 1) * 12]
+
+        tree1 = generate_subplot1(sentence, next_masked_sentences, highlight_info, common_grams)
+        trees1.append(tree1)
+
+        tree2 = generate_subplot2(next_masked_sentences, next_sampled_sentences, highlight_info, common_grams)
+        trees2.append(tree2)
+
+    reparaphrased_sentences = generate_paraphrase(sampled_sentences)
+
+    # Process the sentences in batches of 10
+    reparaphrased_sentences_list = []
+    for i in range(0, len(reparaphrased_sentences), 10):
+        batch = reparaphrased_sentences[i:i + 10]
+        if len(batch) == 10:
+            html_block = reparaphrased_sentences_html(batch)
+            reparaphrased_sentences_list.append(html_block)
+
+    # Calculate metrics (the combined scores must be computed before reading them)
+    distortion_calculator = SentenceDistortionCalculator(user_prompt, reparaphrased_sentences)
+    distortion_calculator.calculate_all_metrics()
+    distortion_calculator.normalize_metrics()
+    distortion_calculator.calculate_combined_distortion()
+    distortion = distortion_calculator.get_combined_distortions()
+    distortion_list = list(distortion.values())
+
+    detectability_calculator = SentenceDetectabilityCalculator(user_prompt, reparaphrased_sentences)
+    detectability_calculator.calculate_all_metrics()
+    detectability_calculator.normalize_metrics()
+    detectability_calculator.calculate_combined_detectability()
+    detectability = detectability_calculator.get_combined_detectabilities()
+    detectability_list = list(detectability.values())
+
+    # Distances are computed and normalized in the constructor
+    euclidean_dist_calculator = SentenceEuclideanDistanceCalculator(user_prompt, reparaphrased_sentences)
+    euclidean_dist = euclidean_dist_calculator.get_normalized_metrics()
+    euclidean_dist_list = list(euclidean_dist.values())
+
+    three_D_plot = gen_three_D_plot(detectability_list, distortion_list, euclidean_dist_list)
+
+    return [highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + trees1 + trees2 + reparaphrased_sentences_list + [three_D_plot]
+
+
+# Gradio Interface
+with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
+    gr.Markdown("# **AIISC Watermarking Model**")
+
+    with gr.Row():
+        user_input = gr.Textbox(label="User Prompt")
+
+    with gr.Row():
+        submit_button = gr.Button("Submit")
+        clear_button = gr.Button("Clear")
+
+    with gr.Row():
+        highlighted_user_prompt = gr.HTML()
+
+    with gr.Row():
+        with gr.Tabs():
+            with gr.TabItem("Paraphrased Sentences"):
+                highlighted_accepted_sentences = gr.HTML()
+            with gr.TabItem("Discarded Sentences"):
+                highlighted_discarded_sentences = gr.HTML()
+
+    with gr.Row():
+        gr.Markdown("### Where to Watermark?")  # Label for masked sentences trees
+    with gr.Row():
+        with gr.Tabs():
+            # Create each plot inside its own tab (adjust the range to the number of trees)
+            tree1_tabs = []
+            for i in range(10):
+                with gr.TabItem(f"Sentence {i + 1}"):
+                    tree1_tabs.append(gr.Plot())
+
+    with gr.Row():
+        gr.Markdown("### How to Watermark?")  # Label for sampled sentences trees
+    with gr.Row():
+        with gr.Tabs():
+            tree2_tabs = []
+            for i in range(10):
+                with gr.TabItem(f"Sentence {i + 1}"):
+                    tree2_tabs.append(gr.Plot())
+
+    with gr.Row():
+        gr.Markdown("### Re-paraphrased Sentences")  # Label for re-paraphrased sentences
+
+    with gr.Row():
+        with gr.Tabs():
+            # 120 tabs for 120 batches of sentences
+            reparaphrased_sentences_tabs = []
+            for i in range(120):
+                with gr.TabItem(f"Sentence {i + 1}"):
+                    reparaphrased_sentences_tabs.append(gr.HTML())
+
+    with gr.Row():
+        gr.Markdown("### 3D Plot for Sweet Spot")
+    with gr.Row():
+        three_D_plot = gr.Plot()
+
+    all_outputs = [highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs + reparaphrased_sentences_tabs + [three_D_plot]
+    submit_button.click(model, inputs=user_input, outputs=all_outputs)
+    clear_button.click(lambda: "", inputs=None, outputs=user_input)
+    # Return one value per output component when clearing
+    clear_button.click(lambda: [None] * len(all_outputs), inputs=None, outputs=all_outputs)
+
+demo.launch(share=True)
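
For reference, the Submit handler above wires model() to 3 HTML panes, 10 + 10 tree plots, 120 re-paraphrase tabs and one 3D plot, so the function must return exactly 3 + 10 + 10 + 120 + 1 = 144 values. A minimal sanity sketch (hypothetical helper, not part of the commit):

EXPECTED_OUTPUTS = 3 + 10 + 10 + 120 + 1  # = 144 Gradio output components

def check_output_arity(results):
    # Fail fast if model() and the output component list drift apart
    if len(results) != EXPECTED_OUTPUTS:
        raise ValueError(
            f"model() returned {len(results)} values, expected {EXPECTED_OUTPUTS}"
        )
    return results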
detectability.py ADDED
@@ -0,0 +1,303 @@
+# Import necessary libraries
+import nltk
+import numpy as np
+import torch
+import matplotlib.pyplot as plt
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import BertModel, BertTokenizer
+from sentence_transformers import SentenceTransformer
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+
+# Download NLTK data if not already present
+nltk.download('punkt', quiet=True)
+
+class SentenceDetectabilityCalculator:
+    """
+    A class to calculate and analyze detectability metrics between an original sentence and paraphrased sentences.
+    """
+
+    def __init__(self, original_sentence, paraphrased_sentences):
+        """
+        Initialize the calculator with the original sentence and a list of paraphrased sentences.
+        """
+        self.original_sentence = original_sentence
+        self.paraphrased_sentences = paraphrased_sentences
+        self.metrics = {
+            'BLEU Score': {},
+            'Cosine Similarity': {},
+            'STS Score': {}
+        }
+        self.normalized_metrics = {
+            'BLEU Score': {},
+            'Cosine Similarity': {},
+            'STS Score': {}
+        }
+        self.combined_detectabilities = {}
+
+        # Load pre-trained models
+        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
+        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        self.sts_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+
+        # Calculate original embeddings
+        self.original_embedding = self._get_sentence_embedding(self.original_sentence)
+        self.sts_original_embedding = self.sts_model.encode(self.original_sentence)
+
+    def calculate_all_metrics(self):
+        """
+        Calculate all detectability metrics for each paraphrased sentence.
+        """
+        for idx, paraphrased_sentence in enumerate(self.paraphrased_sentences):
+            key = f"Sentence_{idx + 1}"
+            self.metrics['BLEU Score'][key] = self._calculate_bleu(self.original_sentence, paraphrased_sentence)
+            paraphrase_embedding = self._get_sentence_embedding(paraphrased_sentence)
+            self.metrics['Cosine Similarity'][key] = cosine_similarity([self.original_embedding], [paraphrase_embedding])[0][0]
+            sts_paraphrase_embedding = self.sts_model.encode(paraphrased_sentence)
+            self.metrics['STS Score'][key] = cosine_similarity([self.sts_original_embedding], [sts_paraphrase_embedding])[0][0]
+
+    def normalize_metrics(self):
+        """
+        Normalize all metrics to be between 0 and 1.
+        """
+        for metric_name, metric_dict in self.metrics.items():
+            self.normalized_metrics[metric_name] = self._normalize_dict(metric_dict)
+
+    def calculate_combined_detectability(self):
+        """
+        Calculate the combined detectability using the root mean square of the normalized metrics.
+        """
+        for key in self.normalized_metrics['BLEU Score'].keys():
+            rms = np.sqrt(sum(
+                self.normalized_metrics[metric][key] ** 2 for metric in self.normalized_metrics
+            ) / len(self.normalized_metrics))
+            self.combined_detectabilities[key] = rms
+
+    def plot_metrics(self):
+        """
+        Plot each normalized metric and the combined detectability in separate graphs.
+        """
+        keys = list(self.normalized_metrics['BLEU Score'].keys())
+        indices = np.arange(len(keys))
+
+        # Prepare data for plotting
+        metrics = {name: [self.normalized_metrics[name][key] for key in keys] for name in self.normalized_metrics}
+
+        # Plot each metric separately
+        for metric_name, values in metrics.items():
+            plt.figure(figsize=(12, 6))
+            plt.plot(indices, values, marker='o', color=np.random.rand(3,))
+            plt.xlabel('Sentence Index')
+            plt.ylabel('Normalized Value (0-1)')
+            plt.title(f'Normalized {metric_name}')
+            plt.grid(True)
+            plt.tight_layout()
+            plt.show()
+
+    # Private methods for metric calculations
+    def _calculate_bleu(self, reference, candidate):
+        """
+        Calculate the BLEU score between the original and paraphrased sentence using smoothing.
+        """
+        reference_tokens = nltk.word_tokenize(reference)
+        candidate_tokens = nltk.word_tokenize(candidate)
+        smoothing = SmoothingFunction().method1
+        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)
+
+    def _get_sentence_embedding(self, sentence):
+        """
+        Get sentence embedding using BERT.
+        """
+        tokens = self.bert_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
+        with torch.no_grad():
+            outputs = self.bert_model(**tokens)
+        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+
+    def _normalize_dict(self, metric_dict):
+        """
+        Normalize the values in a dictionary to be between 0 and 1.
+        """
+        values = np.array(list(metric_dict.values()))
+        min_val = values.min()
+        max_val = values.max()
+        # Avoid division by zero if all values are the same
+        return dict(zip(metric_dict.keys(), np.zeros_like(values) if max_val - min_val == 0 else (values - min_val) / (max_val - min_val)))
+
+    # Getter methods
+    def get_normalized_metrics(self):
+        """
+        Get all normalized metrics as a dictionary.
+        """
+        return self.normalized_metrics
+
+    def get_combined_detectabilities(self):
+        """
+        Get the dictionary of combined detectability values.
+        """
+        return self.combined_detectabilities
+
+
+# Example usage
+if __name__ == "__main__":
+    # Original sentence
+    original_sentence = "The quick brown fox jumps over the lazy dog"
+
+    # Paraphrased sentences
+    paraphrased_sentences = [
+        # Original 1: "A swift auburn fox leaps across a sleepy canine."
+        "The swift auburn fox leaps across a sleepy canine.",
+        "A quick auburn fox leaps across a sleepy canine.",
+        "A swift ginger fox leaps across a sleepy canine.",
+        "A swift auburn fox bounds across a sleepy canine.",
+        "A swift auburn fox leaps across a tired canine.",
+        "Three swift auburn foxes leap across a sleepy canine.",
+        "The vulpine specimen rapidly traverses over a dormant dog.",
+        "Like lightning, the russet hunter soars over the drowsy guardian.",
+        "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
+        "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
+        "A swift auburn predator navigates across a lethargic pet.",
+        "Subject A (fox) demonstrates velocity over Subject B (dog).",
+
+        # Original 2: "The agile russet fox bounds over an idle hound."
+        "Some agile russet foxes bound over an idle hound.",
+        "The nimble russet fox bounds over an idle hound.",
+        "The agile brown fox bounds over an idle hound.",
+        "The agile russet fox jumps over an idle hound.",
+        "The agile russet fox bounds over a lazy hound.",
+        "Two agile russet foxes bound over an idle hound.",
+        "A dexterous vulpine surpasses a stationary canine.",
+        "Quick as thought, the copper warrior sails over the guardian.",
+        "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
+        "A dexterous V. vulpes exceeds the plane of an inactive canine.",
+        "An agile russet hunter maneuvers above a resting hound.",
+        "Test subject F-1 achieves displacement superior to subject D-1.",
+
+        # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
+        "The nimble mahogany vulpine vaults above a drowsy dog.",
+        "A swift mahogany vulpine vaults above a drowsy dog.",
+        "A nimble reddish vulpine vaults above a drowsy dog.",
+        "A nimble mahogany fox vaults above a drowsy dog.",
+        "A nimble mahogany vulpine leaps above a drowsy dog.",
+        "Four nimble mahogany vulpines vault above a drowsy dog.",
+        "An agile specimen of reddish fur surpasses a somnolent canine.",
+        "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
+        "Tha quick brown beastie jumps o'er the tired pup, aye.",
+        "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
+        "A nimble rust-colored predator crosses above a drowsy pet.",
+        "Observed: Subject Red executes vertical motion over Subject Gray.",
+
+        # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
+        "A speedy copper-colored fox hops over the lethargic pup.",
+        "The quick copper-colored fox hops over the lethargic pup.",
+        "The speedy bronze fox hops over the lethargic pup.",
+        "The speedy copper-colored fox jumps over the lethargic pup.",
+        "The speedy copper-colored fox hops over the tired pup.",
+        "Multiple speedy copper-colored foxes hop over the lethargic pup.",
+        "A rapid vulpine of bronze hue traverses an inactive young canine.",
+        "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
+        "Tha fast copper beastie leaps o'er the sleepy wee dog.",
+        "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
+        "A fleet copper-toned predator moves past a sluggish young dog.",
+        "Field note: Adult fox subject exceeds puppy subject vertically.",
+
+        # Original 5: "A rapid tawny fox springs over a sluggish dog."
+        "The rapid tawny fox springs over a sluggish dog.",
+        "A quick tawny fox springs over a sluggish dog.",
+        "A rapid golden fox springs over a sluggish dog.",
+        "A rapid tawny fox jumps over a sluggish dog.",
+        "A rapid tawny fox springs over a lazy dog.",
+        "Six rapid tawny foxes spring over a sluggish dog.",
+        "An expeditious yellowish vulpine surpasses a torpid canine.",
+        "Fast as a bullet, the golden hunter vaults over the idle guard.",
+        "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
+        "One V. vulpes displays rapid transit over one inactive C. familiaris.",
+        "A speedy yellow-brown predator bypasses a motionless dog.",
+        "Log entry: Vulpine subject achieves swift vertical displacement.",
+
+        # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
+        "A fleet-footed chestnut fox soars above an indolent canine.",
+        "The swift chestnut fox soars above an indolent canine.",
+        "The fleet-footed brown fox soars above an indolent canine.",
+        "The fleet-footed chestnut fox leaps above an indolent canine.",
+        "The fleet-footed chestnut fox soars above a lazy canine.",
+        "Several fleet-footed chestnut foxes soar above an indolent canine.",
+        "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
+        "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
+        "Tha quick brown beastie sails o'er the sleepy hound, ken.",
+        "Single agile V. vulpes achieves elevation above stationary canine.",
+        "A nimble brown predator glides over an unmoving domestic animal.",
+        "Research note: Brown subject displays superior vertical mobility.",
+
+        # Original 7: "A fast ginger fox hurdles past a slothful dog."
+        "The fast ginger fox hurdles past a slothful dog.",
+        "A quick ginger fox hurdles past a slothful dog.",
+        "A fast red fox hurdles past a slothful dog.",
+        "A fast ginger fox jumps past a slothful dog.",
+        "A fast ginger fox hurdles past a lazy dog.",
+        "Five fast ginger foxes hurdle past a slothful dog.",
+        "A rapid orange vulpine bypasses a lethargic canine.",
+        "Quick as lightning, the flame-colored hunter races past the lazy guard.",
+        "Tha swift ginger beastie leaps past the tired doggy, ye see.",
+        "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
+        "A speedy red-orange predator overtakes a motionless dog.",
+        "Data point: Orange subject demonstrates rapid transit past Gray subject.",
+
+        # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
+        "A spry rusty-colored fox jumps across a dozing hound.",
+        "The agile rusty-colored fox jumps across a dozing hound.",
+        "The spry reddish fox jumps across a dozing hound.",
+        "The spry rusty-colored fox leaps across a dozing hound.",
+        "The spry rusty-colored fox jumps across a sleeping hound.",
+        "Multiple spry rusty-colored foxes jump across a dozing hound.",
+        "An agile rust-toned vulpine traverses a somnolent canine.",
+        "Nimble as thought, the copper hunter bounds over the resting guard.",
+        "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
+        "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
+        "A lithe rust-tinted predator moves past a slumbering dog.",
+        "Observation: Russet subject exhibits agility over dormant subject.",
+
+        # Original 9: "A quick tan fox leaps over an inactive dog."
+        "The quick tan fox leaps over an inactive dog.",
+        "A swift tan fox leaps over an inactive dog.",
+        "A quick beige fox leaps over an inactive dog.",
+        "A quick tan fox jumps over an inactive dog.",
+        "A quick tan fox leaps over a motionless dog.",
+        "Seven quick tan foxes leap over an inactive dog.",
+        "A rapid light-brown vulpine surpasses a stationary canine.",
+        "Fast as wind, the sand-colored hunter soars over the still guard.",
+        "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
+        "One agile fawn V. vulpes traverses one immobile C. familiaris.",
+        "A fleet tan-colored predator bypasses an unmoving dog.",
+        "Field report: Tan subject demonstrates movement over static subject.",
+
+        # Original 10: "The brisk auburn vulpine bounces over a listless canine."
+        "Some brisk auburn vulpines bounce over a listless canine.",
+        "The quick auburn vulpine bounces over a listless canine.",
+        "The brisk russet vulpine bounces over a listless canine.",
+        "The brisk auburn fox bounces over a listless canine.",
+        "The brisk auburn vulpine jumps over a listless canine.",
+        "Five brisk auburn vulpines bounce over a listless canine.",
+        "The expeditious specimen supersedes a quiescent Canis lupus.",
+        "Swift as wind, the russet hunter vaults over the idle guardian.",
+        "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
+        "One V. vulpes achieves displacement over inactive C. familiaris.",
+        "A high-velocity auburn predator traverses an immobile animal.",
+        "Final observation: Red subject shows mobility over Gray subject."
+    ]
+
+    # Create the calculator instance
+    calculator = SentenceDetectabilityCalculator(original_sentence, paraphrased_sentences)
+
+    # Calculate metrics
+    calculator.calculate_all_metrics()
+    calculator.normalize_metrics()
+    calculator.calculate_combined_detectability()
+
+    # Plot metrics
+    calculator.plot_metrics()
+
+    # Get results
+    normalized_metrics = calculator.get_normalized_metrics()
+    combined_detectabilities = calculator.get_combined_detectabilities()
+
+    print("Normalized Metrics:", normalized_metrics)
+    print("Combined Detectabilities:", combined_detectabilities)
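
The combined detectability above is the root mean square of the three normalized scores. A standalone sketch of that formula, independent of the BERT and sentence-transformer models (inputs are made up for illustration):

import numpy as np

def combined_rms(normalized_scores):
    # Root mean square of the normalized BLEU, cosine and STS scores
    values = np.asarray(normalized_scores, dtype=float)
    return float(np.sqrt(np.mean(values ** 2)))

print(combined_rms([0.2, 0.5, 0.9]))  # ≈ 0.61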
distortion.py ADDED
@@ -0,0 +1,126 @@
+# Import necessary libraries
+import nltk
+import numpy as np
+import torch
+import matplotlib.pyplot as plt
+from scipy.special import rel_entr
+from collections import Counter
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+
+# Download NLTK data if not already present
+nltk.download('punkt', quiet=True)
+
+class SentenceDistortionCalculator:
+    """
+    A class to calculate and analyze distortion metrics between an original sentence and modified sentences.
+    """
+
+    def __init__(self, original_sentence, modified_sentences):
+        self.original_sentence = original_sentence
+        self.modified_sentences = modified_sentences
+        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+        self.model = GPT2LMHeadModel.from_pretrained("gpt2").eval()  # Set model to evaluation mode
+
+        # Raw metric dictionaries
+        self.metrics = {
+            'levenshtein': {},
+            'word_level_changes': {},
+            'kl_divergences': {},
+            'perplexities': {},
+        }
+
+        # Combined distortion dictionary
+        self.combined_distortions = {}
+
+    def calculate_all_metrics(self):
+        """Calculate all distortion metrics for each modified sentence."""
+        for idx, modified_sentence in enumerate(self.modified_sentences):
+            key = f"Sentence_{idx + 1}"
+            self.metrics['levenshtein'][key] = self._calculate_levenshtein_distance(modified_sentence)
+            self.metrics['word_level_changes'][key] = self._calculate_word_level_change(modified_sentence)
+            self.metrics['kl_divergences'][key] = self._calculate_kl_divergence(modified_sentence)
+            self.metrics['perplexities'][key] = self._calculate_perplexity(modified_sentence)
+
+    def normalize_metrics(self):
+        """Normalize all metrics to be between 0 and 1."""
+        for metric in self.metrics:
+            self.metrics[metric] = self._normalize_dict(self.metrics[metric])
+
+    def calculate_combined_distortion(self):
+        """Calculate the combined distortion using the root mean square of the normalized metrics."""
+        for key in self.metrics['levenshtein']:
+            rms = np.sqrt(sum(self.metrics[metric][key] ** 2 for metric in self.metrics) / len(self.metrics))
+            self.combined_distortions[key] = rms
+
+    def plot_metrics(self):
+        """Plot each normalized metric and the combined distortion in separate graphs."""
+        keys = list(self.metrics['levenshtein'].keys())
+        indices = np.arange(len(keys))
+
+        for metric_name, values in self.metrics.items():
+            plt.figure(figsize=(12, 6))
+            plt.plot(indices, list(values.values()), marker='o', label=metric_name)
+            plt.xlabel('Sentence Index')
+            plt.ylabel('Normalized Value (0-1)')
+            plt.title(f'Normalized {metric_name.replace("_", " ").title()}')
+            plt.grid(True)
+            plt.legend()
+            plt.tight_layout()
+            plt.show()
+
+    # Private methods for metric calculations
+    def _calculate_levenshtein_distance(self, modified_sentence):
+        """Calculate the Levenshtein Distance between the original and modified sentence."""
+        return nltk.edit_distance(self.original_sentence, modified_sentence)
+
+    def _calculate_word_level_change(self, modified_sentence):
+        """Calculate the proportion of word-level changes between the original and modified sentence."""
+        original_words = self.original_sentence.split()
+        modified_words = modified_sentence.split()
+        total_words = max(len(original_words), len(modified_words))
+        changed_words = sum(o != m for o, m in zip(original_words, modified_words)) + abs(len(original_words) - len(modified_words))
+        return changed_words / total_words if total_words > 0 else 0
+
+    def _calculate_kl_divergence(self, modified_sentence):
+        """Calculate the KL Divergence between the word distributions of the original and modified sentence."""
+        original_counts = Counter(self.original_sentence.lower().split())
+        modified_counts = Counter(modified_sentence.lower().split())
+        all_words = set(original_counts.keys()).union(modified_counts.keys())
+
+        # Epsilon-smooth the counts so rel_entr never sees a zero in its
+        # second argument (which would make the divergence infinite)
+        original_probs = np.array([original_counts[word] + 1e-10 for word in all_words], dtype=float)
+        modified_probs = np.array([modified_counts[word] + 1e-10 for word in all_words], dtype=float)
+
+        original_probs /= original_probs.sum()
+        modified_probs /= modified_probs.sum()
+
+        return np.sum(rel_entr(original_probs, modified_probs))
+
+    def _calculate_perplexity(self, sentence):
+        """Calculate the perplexity of a sentence using GPT-2."""
+        encodings = self.tokenizer(sentence, return_tensors='pt')
+        stride = self.model.config.n_positions
+        log_likelihoods = []
+
+        for i in range(0, encodings.input_ids.size(1), stride):
+            input_ids = encodings.input_ids[:, i:i + stride]
+            with torch.no_grad():
+                outputs = self.model(input_ids, labels=input_ids)
+            log_likelihoods.append(outputs.loss.item())
+
+        avg_log_likelihood = np.mean(log_likelihoods)
+        return torch.exp(torch.tensor(avg_log_likelihood)).item()
+
+    def _normalize_dict(self, metric_dict):
+        """Normalize the values in a dictionary to be between 0 and 1."""
+        values = np.array(list(metric_dict.values()))
+        min_val, max_val = values.min(), values.max()
+        normalized_values = (values - min_val) / (max_val - min_val) if max_val > min_val else np.zeros_like(values)
+        return dict(zip(metric_dict.keys(), normalized_values))
+
+    def get_normalized_metrics(self):
+        """Get all normalized metrics as a dictionary."""
+        return {metric: self._normalize_dict(values) for metric, values in self.metrics.items()}
+
+    def get_combined_distortions(self):
+        """Get the dictionary of combined distortion values."""
+        return self.combined_distortions
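
The KL term above compares epsilon-smoothed bag-of-words distributions. A self-contained sketch of the same computation on two toy sentences (illustrative only):

from collections import Counter

import numpy as np
from scipy.special import rel_entr

def word_kl(original, modified, eps=1e-10):
    p, q = Counter(original.lower().split()), Counter(modified.lower().split())
    vocab = sorted(set(p) | set(q))
    # Epsilon-smoothed unigram distributions over the joint vocabulary
    P = np.array([p[w] + eps for w in vocab], dtype=float)
    Q = np.array([q[w] + eps for w in vocab], dtype=float)
    P /= P.sum()
    Q /= Q.sum()
    return float(np.sum(rel_entr(P, Q)))

print(word_kl("the quick brown fox", "the slow brown fox"))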
entailment.py ADDED
@@ -0,0 +1,33 @@
+from transformers import pipeline
+
+def analyze_entailment(original_sentence, paraphrased_sentences, threshold):
+    # Load the entailment model once
+    entailment_pipe = pipeline("text-classification", model="ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli")
+
+    all_sentences = {}
+    selected_sentences = {}
+    discarded_sentences = {}
+
+    # Prepare input for entailment checks
+    inputs = [f"{original_sentence} [SEP] {paraphrase}" for paraphrase in paraphrased_sentences]
+
+    # Perform entailment checks for all paraphrased sentences in one go
+    entailment_results = entailment_pipe(inputs, return_all_scores=True)
+
+    # Iterate over results
+    for paraphrased_sentence, results in zip(paraphrased_sentences, entailment_results):
+        # Extract the entailment score for each paraphrased sentence
+        entailment_score = next((result['score'] for result in results if result['label'] == 'entailment'), 0)
+
+        all_sentences[paraphrased_sentence] = entailment_score
+
+        # Store sentences based on the threshold
+        if entailment_score >= threshold:
+            selected_sentences[paraphrased_sentence] = entailment_score
+        else:
+            discarded_sentences[paraphrased_sentence] = entailment_score
+
+    return all_sentences, selected_sentences, discarded_sentences
+
+# Example usage
+# print(analyze_entailment("I love you", ["I adore you", "I hate you"], 0.7))
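
The selection rule itself is a simple threshold partition over per-sentence entailment scores; a model-free sketch (scores are hypothetical):

def partition_by_score(scores, threshold=0.7):
    # Mirror of the selected/discarded split used in analyze_entailment
    selected = {s: v for s, v in scores.items() if v >= threshold}
    discarded = {s: v for s, v in scores.items() if v < threshold}
    return selected, discarded

print(partition_by_score({"I adore you": 0.93, "I hate you": 0.02}))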
euclidean_distance.py ADDED
@@ -0,0 +1,74 @@
+# Import necessary libraries
+import numpy as np
+import matplotlib.pyplot as plt
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import euclidean_distances
+
+class SentenceEuclideanDistanceCalculator:
+    """
+    A class to calculate and analyze Euclidean distance between an original sentence and paraphrased sentences.
+    """
+
+    def __init__(self, original_sentence, paraphrased_sentences):
+        """
+        Initialize the calculator with the original sentence and a list of paraphrased sentences.
+        """
+        self.original_sentence = original_sentence
+        self.paraphrased_sentences = paraphrased_sentences
+
+        # Load SentenceTransformer model for embedding calculation
+        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+        # Precompute the original sentence embedding
+        self.original_embedding = self.model.encode(original_sentence, convert_to_tensor=True)
+
+        # Calculate Euclidean distances and normalize them
+        self.euclidean_distances = self._calculate_all_metrics()
+        self.normalized_euclidean = self._normalize_dict(self.euclidean_distances)
+
+    def _calculate_all_metrics(self):
+        """
+        Calculate Euclidean distance between the original and each paraphrased sentence.
+        """
+        distances = {}
+        paraphrase_embeddings = self.model.encode(self.paraphrased_sentences, convert_to_tensor=True)
+
+        for idx, paraphrase_embedding in enumerate(paraphrase_embeddings):
+            key = f"Sentence_{idx + 1}"
+            distances[key] = euclidean_distances([self.original_embedding], [paraphrase_embedding])[0][0]
+
+        return distances
+
+    def _normalize_dict(self, metric_dict):
+        """
+        Normalize the values in a dictionary to be between 0 and 1.
+        """
+        values = np.array(list(metric_dict.values()))
+        min_val, max_val = values.min(), values.max()
+
+        # Normalize values
+        normalized_values = (values - min_val) / (max_val - min_val) if max_val > min_val else np.zeros_like(values)
+        return dict(zip(metric_dict.keys(), normalized_values))
+
+    def plot_metrics(self):
+        """
+        Plot the normalized Euclidean distances in a graph.
+        """
+        keys = list(self.normalized_euclidean.keys())
+        indices = np.arange(len(keys))
+
+        plt.figure(figsize=(12, 6))
+        plt.plot(indices, [self.normalized_euclidean[key] for key in keys], marker='o', color=np.random.rand(3,))
+        plt.xlabel('Sentence Index')
+        plt.ylabel('Normalized Euclidean Distance (0-1)')
+        plt.title('Normalized Euclidean Distance')
+        plt.grid(True)
+        plt.tight_layout()
+        plt.show()
+
+    # Getter methods
+    def get_normalized_metrics(self):
+        """
+        Get the normalized Euclidean distances as a dictionary.
+        """
+        return self.normalized_euclidean
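
Note that, unlike the other calculators, this class does all its work in the constructor and ships no example block. A hypothetical usage sketch (assuming the module is importable from the repo root):

from euclidean_distance import SentenceEuclideanDistanceCalculator

calc = SentenceEuclideanDistanceCalculator(
    "The quick brown fox jumps over the lazy dog",
    [
        "A fast brown fox leaps over a lazy dog",
        "An unrelated sentence about the weather",
    ],
)
print(calc.get_normalized_metrics())  # e.g. {'Sentence_1': 0.0, 'Sentence_2': 1.0}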
gpt_mask_filling.py ADDED
@@ -0,0 +1,70 @@
+import openai
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+openai.api_key = os.getenv("API_KEY")
+
+
+# Takes in a sentence and returns a list of dicts consisting of key-value pairs of masked words and lists of the possible replacements
+def predict_masked_words(sentence, n_suggestions=5):
+
+    prompt = (
+        f"Given a sentence with masked words (one or more, indicated by [MASK]), generate {n_suggestions} possible words to fill each mask. "
+        "Return the results as a list of dictionaries, where each dictionary key is a masked word and its value is a list of 5 potential words to fill that mask.\n\n"
+        "Example input: \"The [MASK] fox [MASK] over the [MASK] dog.\"\n\n"
+        "Example output:\n"
+        "[\n"
+        "  {\n"
+        "    \"[MASK]1\": [\"quick\", \"sly\", \"red\", \"clever\", \"sneaky\"]\n"
+        "  },\n"
+        "  {\n"
+        "    \"[MASK]2\": [\"jumped\", \"leaped\", \"hopped\", \"sprang\", \"bounded\"]\n"
+        "  },\n"
+        "  {\n"
+        "    \"[MASK]3\": [\"lazy\", \"sleeping\", \"brown\", \"tired\", \"old\"]\n"
+        "  }\n"
+        "]\n\n"
+        "Example input: \"The [MASK] [MASK] ran swiftly across the [MASK] field.\"\n\n"
+        "Example output:\n"
+        "[\n"
+        "  {\n"
+        "    \"[MASK]1\": [\"tall\", \"fierce\", \"young\", \"old\", \"beautiful\"]\n"
+        "  },\n"
+        "  {\n"
+        "    \"[MASK]2\": [\"lion\", \"tiger\", \"horse\", \"cheetah\", \"deer\"]\n"
+        "  },\n"
+        "  {\n"
+        "    \"[MASK]3\": [\"green\", \"wide\", \"sunny\", \"open\", \"empty\"]\n"
+        "  }\n"
+        "]\n\n"
+        "Example input: \"It was a [MASK] day when the train arrived at the station.\"\n\n"
+        "Example output:\n"
+        "[\n"
+        "  {\n"
+        "    \"[MASK]1\": [\"sunny\", \"rainy\", \"cloudy\", \"foggy\", \"stormy\"]\n"
+        "  },\n"
+        "]\n\n"
+        "Now, please process the following sentence:\n"
+        f"{sentence}"
+    )
+
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ],
+        max_tokens=100,
+        n=1,
+        stop=None,
+        temperature=0.7
+    )
+
+    # Return the model's reply so callers can parse it (was: print only)
+    return response['choices'][0]['message']['content']
+
+
+# sentence = "Evacuations and storm [MASK] began on Sunday night as forecasters projected that Hurricane Dorian would hit into Florida’s west coast on Wednesday as a major hurricane packing life-threatening winds and storm surge."
+# predict_masked_words(sentence, n_suggestions=5)
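
A hypothetical invocation: it needs an API_KEY entry in a local .env file (which the .gitignore added in this commit keeps out of the repo), and since the file uses the legacy openai.ChatCompletion call, it assumes an openai SDK older than 1.0:

from gpt_mask_filling import predict_masked_words

suggestions = predict_masked_words(
    "The [MASK] fox [MASK] over the [MASK] dog.", n_suggestions=5
)
print(suggestions)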
highlighter.py ADDED
@@ -0,0 +1,92 @@
+import re
+
+def highlight_common_words(common_words, sentences, title):
+    color_map = {}
+    highlighted_html = []
+
+    for idx, sentence in enumerate(sentences, start=1):
+        highlighted_sentence = f"{idx}. {sentence}"
+
+        for index, word in common_words:
+            if word not in color_map:
+                # Assign color using HSL for better visual distinction
+                color_map[word] = f'hsl({(len(color_map) % 6) * 60}, 70%, 80%)'
+
+            # Create a regex pattern for the word
+            escaped_word = re.escape(word)
+            pattern = rf'\b{escaped_word}\b'
+            color = color_map[word]
+
+            # Use a lambda function for word highlighting (note: every
+            # segment that interpolates a variable must be an f-string)
+            highlighted_sentence = re.sub(
+                pattern,
+                lambda m: (f'<span style="background-color: {color}; font-weight: bold;'
+                           ' padding: 2px 4px; border-radius: 2px; position: relative;">'
+                           '<span style="background-color: black; color: white; border-radius: 50%;'
+                           f' padding: 2px 5px; margin-right: 5px;">{index}</span>'
+                           f'{m.group(0)}'
+                           '</span>'),
+                highlighted_sentence,
+                flags=re.IGNORECASE
+            )
+
+        highlighted_html.append(highlighted_sentence)
+
+    # Construct the final HTML output
+    return generate_html(title, highlighted_html)
+
+def highlight_common_words_dict(common_words, sentences, title):
+    color_map = {}
+    highlighted_html = []
+
+    for idx, (sentence, score) in enumerate(sentences.items(), start=1):
+        highlighted_sentence = f"{idx}. {sentence}"
+
+        for index, word in common_words:
+            if word not in color_map:
+                color_map[word] = f'hsl({(len(color_map) % 6) * 60}, 70%, 80%)'
+            escaped_word = re.escape(word)
+            pattern = rf'\b{escaped_word}\b'
+            color = color_map[word]
+
+            highlighted_sentence = re.sub(
+                pattern,
+                lambda m: (f'<span style="background-color: {color}; font-weight: bold;'
+                           ' padding: 1px 2px; border-radius: 2px; position: relative;">'
+                           '<span style="background-color: black; color: white; border-radius: 50%;'
+                           f' padding: 1px 3px; margin-right: 3px; font-size: 0.8em;">{index}</span>'
+                           f'{m.group(0)}'
+                           '</span>'),
+                highlighted_sentence,
+                flags=re.IGNORECASE
+            )
+
+        highlighted_html.append(
+            f'<div style="margin-bottom: 5px;">'
+            f'{highlighted_sentence}'
+            '<div style="display: inline-block; margin-left: 5px; padding: 3px 5px; border-radius: 3px;'
+            f' background-color: white; font-size: 0.9em;">Entailment Score: {score}</div></div>'
+        )
+
+    return generate_html(title, highlighted_html)
+
+def generate_html(title, highlighted_html):
+    final_html = "<br><br>".join(highlighted_html)
+    return f'''
+    <div style="border: solid 1px #ccc; padding: 16px; background-color: #FFFFFF; color: #374151;
+                box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
+        <h3 style="margin-top: 0; font-size: 1em; color: #111827;">{title}</h3>
+        <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
+    </div>
+    '''
+
+def reparaphrased_sentences_html(sentences):
+    formatted_sentences = [f"{idx + 1}. {sentence}" for idx, sentence in enumerate(sentences)]
+    final_html = "<br><br>".join(formatted_sentences)
+
+    return f'''
+    <div style="border: solid 1px #ccc; padding: 16px; background-color: #FFFFFF; color: #374151;
+                box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
+        <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
+    </div>
+    '''
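
A quick way to eyeball the generated markup is to write it to a file and open it in a browser; a hypothetical sketch:

from highlighter import highlight_common_words

html = highlight_common_words(
    [(1, "brown fox"), (2, "lazy dog")],
    ["The quick brown fox jumps over the lazy dog"],
    "Non-melting Points in the User Prompt",
)
with open("preview.html", "w") as f:
    f.write(html)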
lcs.py ADDED
@@ -0,0 +1,63 @@
+import re
+from nltk.corpus import stopwords
+
+def find_common_subsequences(sentence, str_list):
+    # Load stop words
+    stop_words = set(stopwords.words('english'))
+
+    # Preprocess the input sentence and list of strings
+    sentence = sentence.lower()
+    cleaned_str_list = [s.lower() for s in str_list]
+
+    def clean_text(text):
+        """Remove stop words and special characters from a given text."""
+        text = re.sub(r'[^\w\s]', '', text)
+        return " ".join(word for word in text.split() if word not in stop_words)
+
+    cleaned_sentence = clean_text(sentence)
+    cleaned_str_list = [clean_text(s) for s in cleaned_str_list]
+
+    words = cleaned_sentence.split()
+    common_grams = []
+    added_phrases = set()
+
+    for n in range(5, 0, -1):  # Check n-grams from size 5 to 1
+        for i in range(len(words) - n + 1):
+            subseq = " ".join(words[i:i + n])
+            if is_present(subseq, cleaned_str_list) and subseq not in added_phrases:
+                common_grams.append((i, subseq))
+                added_phrases.add(subseq)
+
+    # Sort by the first appearance in the original sentence and create indexed common grams
+    common_grams.sort(key=lambda x: x[0])
+    return [(index + 1, subseq) for index, (_, subseq) in enumerate(common_grams)]
+
+def is_present(subseq, str_list):
+    """Check if a subsequence is present in all strings in the list."""
+    subseq_regex = re.compile(r'\b' + re.escape(subseq) + r'\b')
+    return all(subseq_regex.search(s) for s in str_list)
+
+def find_common_gram_positions(str_list, common_grams):
+    """Find positions of common grams in each string from str_list."""
+    positions = []
+
+    for sentence in str_list:
+        words = re.sub(r'[^\w\s]', '', sentence).lower().split()
+        word_positions = {word: [] for word in words}
+
+        for idx, word in enumerate(words):
+            word_positions[word].append(idx + 1)  # Store 1-based index positions
+
+        sentence_positions = []
+        for _, gram in common_grams:
+            gram_words = re.sub(r'[^\w\s]', '', gram).lower().split()
+
+            if all(word in word_positions for word in gram_words):
+                start_idx = word_positions[gram_words[0]][0]
+                sentence_positions.append(start_idx)
+            else:
+                sentence_positions.append(-1)  # Common gram not found
+
+        positions.append(sentence_positions)
+
+    return positions
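
A worked example of the two helpers (assuming the NLTK stopwords corpus is downloaded; note that find_common_gram_positions expects the (index, gram) tuples returned by find_common_subsequences, not bare strings):

from lcs import find_common_subsequences, find_common_gram_positions

paraphrases = [
    "A quick brown fox leaps over a lazy dog",
    "The quick brown fox hops over the lazy dog",
]
grams = find_common_subsequences(
    "The quick brown fox jumps over the lazy dog", paraphrases
)
print(grams)  # indexed n-grams shared by every paraphrase, e.g. (1, 'quick brown fox')
print(find_common_gram_positions(paraphrases, grams))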
masking_methods.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
import random
from nltk.corpus import stopwords
import nltk
from vocabulary_split import split_vocabulary, filter_logits

# Load tokenizer and model for masked language modeling
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Get permissible vocabulary
permissible, _ = split_vocabulary(seed=42)
permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])

# Ensure NLTK resources are downloaded, then initialize stop words
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)
stop_words = set(stopwords.words('english'))

def get_logits_for_mask(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    return logits[0, mask_token_index, :].squeeze()

def mask_word(sentence, word):
    masked_sentence = sentence.replace(word, '[MASK]', 1)
    logits = get_logits_for_mask(masked_sentence)
    filtered_logits = filter_logits(logits, permissible_indices)
    words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
    return masked_sentence, filtered_logits.tolist(), words

def mask_non_stopword(sentence, pseudo_random=False):
    non_stop_words = [word for word in sentence.split() if word.lower() not in stop_words]
    if not non_stop_words:
        return sentence, None, None

    if pseudo_random:
        random.seed(10)  # Fixed seed for pseudo-randomness
    word_to_mask = random.choice(non_stop_words)
    return mask_word(sentence, word_to_mask)

def mask_between_lcs(sentence, lcs_points):
    words = sentence.split()
    masked_indices = []

    # Mask one word before the first LCS point
    if lcs_points and lcs_points[0] > 0:
        idx = random.randint(0, lcs_points[0] - 1)
        words[idx] = '[MASK]'
        masked_indices.append(idx)

    # Mask between consecutive LCS points
    for i in range(len(lcs_points) - 1):
        start, end = lcs_points[i], lcs_points[i + 1]
        if end - start > 1:
            mask_index = random.randint(start + 1, end - 1)
            words[mask_index] = '[MASK]'
            masked_indices.append(mask_index)

    # Mask one word after the last LCS point
    if lcs_points and lcs_points[-1] < len(words) - 1:
        idx = random.randint(lcs_points[-1] + 1, len(words) - 1)
        words[idx] = '[MASK]'
        masked_indices.append(idx)

    masked_sentence = ' '.join(words)
    logits = get_logits_for_mask(masked_sentence)
    if logits.dim() == 1:
        # A single mask squeezes to 1D; restore the mask dimension so that
        # logits[i] below is always a row over the vocabulary
        logits = logits.unsqueeze(0)

    logits_list, top_words_list = [], []
    # masked_indices is built in left-to-right order, so the i-th mask in the
    # sentence corresponds to the i-th row of logits (the original indexed
    # logits by the word position itself, which was a bug)
    for i, idx in enumerate(masked_indices):
        filtered_logits = filter_logits(logits[i], permissible_indices)
        logits_list.append(filtered_logits.tolist())
        top_words = [tokenizer.decode([tok]) for tok in filtered_logits.topk(5).indices.tolist()]
        top_words_list.append(top_words)

    return masked_sentence, logits_list, top_words_list

def high_entropy_words(sentence, non_melting_points):
    non_melting_words = {word.lower() for _, point in non_melting_points for word in point.split()}
    candidate_words = [word for word in sentence.split() if word.lower() not in stop_words and word.lower() not in non_melting_words]

    if not candidate_words:
        return sentence, None, None

    max_entropy, max_entropy_word, max_logits = -float('inf'), None, None
    for word in candidate_words:
        masked_sentence = sentence.replace(word, '[MASK]', 1)
        logits = get_logits_for_mask(masked_sentence)
        filtered_logits = filter_logits(logits, permissible_indices)

        # Entropy over the top-5 token probabilities
        probs = torch.softmax(filtered_logits, dim=-1)
        top_5_probs = probs.topk(5).values
        entropy = -torch.sum(top_5_probs * torch.log(top_5_probs + 1e-10))  # Avoid log(0)

        if entropy > max_entropy:
            max_entropy, max_entropy_word, max_logits = entropy, word, filtered_logits

    if max_entropy_word is None:
        return sentence, None, None

    masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
    words = [tokenizer.decode([i]) for i in max_logits.argsort()[-5:]]
    return masked_sentence, max_logits.tolist(), words

def mask_by_pos(sentence, pos_to_mask=('NN', 'VB', 'JJ')):
    # nltk.pos_tag returns Penn Treebank tags, so match on the two-letter
    # prefixes 'NN' (nouns), 'VB' (verbs), 'JJ' (adjectives); the original
    # universal-tag names ('NOUN', 'VERB', 'ADJ') never matched anything
    words = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)

    maskable_words = [word for word, pos in pos_tags if pos[:2] in pos_to_mask]
    if not maskable_words:
        return sentence, None, None

    word_to_mask = random.choice(maskable_words)
    return mask_word(sentence, word_to_mask)

def mask_named_entity(sentence):
    words = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    named_entities = nltk.ne_chunk(pos_tags)

    # Named entities come back as nltk.Tree subtrees; plain (word, tag) leaves
    # are never Trees, so the original isinstance check on leaves always failed.
    # Collect the words inside each entity subtree instead.
    maskable_words = [leaf[0] for subtree in named_entities if isinstance(subtree, nltk.Tree) for leaf in subtree.leaves()]
    if not maskable_words:
        return sentence, None, None

    word_to_mask = random.choice(maskable_words)
    return mask_word(sentence, word_to_mask)
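A minimal usage sketch for the helpers above. The sentence is made up, and non_melting_points mirrors the (index, ngram) tuples produced by lcs.find_common_subsequences; running this downloads the BERT checkpoint.

from masking_methods import mask_non_stopword, high_entropy_words

sentence = "The committee approved the new budget on Friday"
masked, logits, top5 = mask_non_stopword(sentence, pseudo_random=True)
print(masked, top5)

non_melting_points = [(1, "the committee"), (2, "on friday")]  # hypothetical
masked, logits, top5 = high_entropy_words(sentence, non_melting_points)
print(masked, top5)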
masking_methods_trial.py
ADDED
@@ -0,0 +1,188 @@
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline
import random
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords', quiet=True)
from vocabulary_split import split_vocabulary, filter_logits
import abc
from typing import List

# Load tokenizer and model for masked language modeling
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Get permissible vocabulary
permissible, _ = split_vocabulary(seed=42)
permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])

def get_logits_for_mask(model, tokenizer, sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    mask_token_logits = logits[0, mask_token_index, :]
    return mask_token_logits.squeeze()

# Abstract masking strategy
class MaskingStrategy(abc.ABC):
    @abc.abstractmethod
    def select_words_to_mask(self, words: List[str], **kwargs) -> List[int]:
        """Given a list of words, return the indices of words to mask."""
        pass

# Specific masking strategies
class RandomNonStopwordMasking(MaskingStrategy):
    def __init__(self, num_masks: int = 1):
        self.num_masks = num_masks
        self.stop_words = set(stopwords.words('english'))

    def select_words_to_mask(self, words: List[str], **kwargs) -> List[int]:
        non_stop_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
        if not non_stop_indices:
            return []
        num_masks = min(self.num_masks, len(non_stop_indices))
        return random.sample(non_stop_indices, num_masks)

class HighEntropyMasking(MaskingStrategy):
    def __init__(self, num_masks: int = 1):
        self.num_masks = num_masks

    def select_words_to_mask(self, words: List[str], sentence: str, model, tokenizer, permissible_indices) -> List[int]:
        candidate_indices = [i for i, word in enumerate(words) if word.lower() not in set(stopwords.words('english'))]
        if not candidate_indices:
            return []

        entropy_scores = {}
        for idx in candidate_indices:
            masked_sentence = ' '.join(words[:idx] + ['[MASK]'] + words[idx+1:])
            logits = get_logits_for_mask(model, tokenizer, masked_sentence)
            filtered_logits = filter_logits(logits, permissible_indices)
            probs = torch.softmax(filtered_logits, dim=-1)
            top_5_probs = probs.topk(5).values
            entropy = -torch.sum(top_5_probs * torch.log(top_5_probs + 1e-10)).item()
            entropy_scores[idx] = entropy

        # Select the num_masks indices with the highest entropy
        sorted_indices = sorted(entropy_scores, key=entropy_scores.get, reverse=True)
        return sorted_indices[:self.num_masks]

class PseudoRandomNonStopwordMasking(MaskingStrategy):
    def __init__(self, num_masks: int = 1, seed: int = 10):
        self.num_masks = num_masks
        self.seed = seed
        self.stop_words = set(stopwords.words('english'))

    def select_words_to_mask(self, words: List[str], **kwargs) -> List[int]:
        non_stop_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
        if not non_stop_indices:
            return []
        random.seed(self.seed)
        num_masks = min(self.num_masks, len(non_stop_indices))
        return random.sample(non_stop_indices, num_masks)

class CompositeMaskingStrategy(MaskingStrategy):
    def __init__(self, strategies: List[MaskingStrategy]):
        self.strategies = strategies

    def select_words_to_mask(self, words: List[str], **kwargs) -> List[int]:
        selected_indices = []
        for strategy in self.strategies:
            if isinstance(strategy, HighEntropyMasking):
                selected = strategy.select_words_to_mask(words, **kwargs)
            else:
                selected = strategy.select_words_to_mask(words)
            selected_indices.extend(selected)
        return list(set(selected_indices))  # Remove duplicates

# Refactored mask_between_lcs function
def mask_between_lcs(sentence, lcs_points, masking_strategy: MaskingStrategy, model, tokenizer, permissible_indices):
    words = sentence.split()
    masked_indices = []

    segments = []

    # Define segments based on LCS points
    previous = 0
    for point in lcs_points:
        if point > previous:
            segments.append((previous, point))
        previous = point + 1
    if previous < len(words):
        segments.append((previous, len(words)))

    # Collect all indices to mask from each segment
    for start, end in segments:
        segment_words = words[start:end]
        if isinstance(masking_strategy, HighEntropyMasking):
            selected = masking_strategy.select_words_to_mask(segment_words, sentence, model, tokenizer, permissible_indices)
        else:
            selected = masking_strategy.select_words_to_mask(segment_words)

        # Adjust indices relative to the whole sentence
        for idx in selected:
            masked_idx = start + idx
            if masked_idx not in masked_indices:
                masked_indices.append(masked_idx)

    # Keep sentence order so each mask aligns with its logits row below
    masked_indices.sort()

    # Apply masking
    for idx in masked_indices:
        words[idx] = '[MASK]'

    masked_sentence = ' '.join(words)
    logits = get_logits_for_mask(model, tokenizer, masked_sentence)
    if logits.dim() == 1:
        # A single mask squeezes to a 1D tensor; restore the mask dimension
        # so that logits[i] below is always a row over the vocabulary
        logits = logits.unsqueeze(0)

    # Process each masked token
    top_words_list = []
    logits_list = []
    for i, idx in enumerate(masked_indices):
        filtered_logits_i = filter_logits(logits[i], permissible_indices)
        logits_list.append(filtered_logits_i.tolist())
        top_5_indices = filtered_logits_i.topk(5).indices.tolist()
        top_words = [tokenizer.decode([tok]) for tok in top_5_indices]
        top_words_list.append(top_words)

    return masked_sentence, logits_list, top_words_list

# Example usage
if __name__ == "__main__":
    # Example sentence and LCS points
    sentence = "This is a sample sentence with some LCS points"
    lcs_points = [2, 5, 8]  # Indices of LCS points

    # Initialize masking strategies
    random_non_stopword_strategy = RandomNonStopwordMasking(num_masks=1)
    high_entropy_strategy = HighEntropyMasking(num_masks=1)
    pseudo_random_strategy = PseudoRandomNonStopwordMasking(num_masks=1, seed=10)
    composite_strategy = CompositeMaskingStrategy([
        RandomNonStopwordMasking(num_masks=1),
        HighEntropyMasking(num_masks=1)
    ])

    # Choose a strategy
    chosen_strategy = composite_strategy  # Any initialized strategy works here

    # Apply masking
    masked_sentence, logits_list, top_words_list = mask_between_lcs(
        sentence,
        lcs_points,
        masking_strategy=chosen_strategy,
        model=model,
        tokenizer=tokenizer,
        permissible_indices=permissible_indices
    )

    print("Masked Sentence:", masked_sentence)
    for idx, top_words in enumerate(top_words_list):
        print(f"Top words for mask {idx+1}:", top_words)
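Because MaskingStrategy is an abstract base class, new selection policies plug into mask_between_lcs without modifying it. A minimal sketch under that assumption; FirstWordMasking is hypothetical and not part of this commit.

from typing import List
from masking_methods_trial import (MaskingStrategy, mask_between_lcs,
                                   model, tokenizer, permissible_indices)

class FirstWordMasking(MaskingStrategy):
    """Hypothetical strategy: always mask the first word of each segment."""
    def select_words_to_mask(self, words: List[str], **kwargs) -> List[int]:
        return [0] if words else []

masked, _, top_words = mask_between_lcs(
    "This is a sample sentence with some LCS points",
    [2, 5, 8],
    masking_strategy=FirstWordMasking(),
    model=model, tokenizer=tokenizer,
    permissible_indices=permissible_indices
)
print(masked, top_words)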
paraphraser.py
ADDED
@@ -0,0 +1,45 @@
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()
key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI client
client = OpenAI(api_key=key)

def generate_paraphrase(sentences, model="gpt-4", num_paraphrases=10, max_tokens=150, temperature=0.7):
    """Generate paraphrased sentences using the OpenAI GPT-4 model."""

    # Ensure sentences is a list
    if isinstance(sentences, str):
        sentences = [sentences]

    paraphrased_sentences_list = []

    for sentence in sentences:
        full_prompt = f"Paraphrase the following text: '{sentence}'"

        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": full_prompt}],
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                n=num_paraphrases  # Number of paraphrased sentences to generate
            )
            # Extract paraphrased sentences
            paraphrased_sentences = [choice.message.content.strip() for choice in chat_completion.choices]
            paraphrased_sentences_list.extend(paraphrased_sentences)
        except Exception as e:
            print(f"Error paraphrasing sentence '{sentence}': {e}")

    return paraphrased_sentences_list

# Example usage, guarded so that importing this module (as watermark_detector
# does) no longer fires ten GPT-4 calls at import time
if __name__ == "__main__":
    result = generate_paraphrase(
        "Mayor Eric Adams did not attend the first candidate forum for the New York City mayoral race, but his record — and the criminal charges he faces — received plenty of attention on Saturday from the Democrats who are running to unseat him."
    )
    print(f"Number of paraphrases generated: {len(result)}")
requirements.txt
ADDED
@@ -0,0 +1,21 @@
ipywidgets
transformers
plotly
requests
Pillow
numpy
matplotlib
tqdm
scipy
torch
seaborn
termcolor
nltk
tenacity
pandas
graphviz==0.20.3
gradio==4.29.0
openai
python-dotenv
scikit-learn
sentence-transformers
sampling_methods.py
ADDED
@@ -0,0 +1,35 @@
import torch
from vocabulary_split import split_vocabulary, filter_logits
from masking_methods import tokenizer

# Get permissible vocabulary
permissible, _ = split_vocabulary(seed=42)
permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])

def sample_word(sentence, words, logits, sampling_technique='inverse_transform', temperature=1.0):
    # Convert logits to a tensor and filter based on permissible indices
    filtered_logits = filter_logits(torch.tensor(logits), permissible_indices)
    probs = torch.softmax(filtered_logits / temperature, dim=-1)

    # Select sampling technique
    if sampling_technique == 'inverse_transform':
        # Draw u ~ U[0, 1) and invert the CDF; searchsorted needs a tensor,
        # not the Python float the original passed
        cumulative_probs = torch.cumsum(probs, dim=-1)
        sampled_index = torch.searchsorted(cumulative_probs, torch.rand(1)).item()
        sampled_index = min(sampled_index, probs.numel() - 1)  # Guard float round-off
    elif sampling_technique == 'exponential_minimum':
        # Exponential race: sample e_i ~ Exp(1) and take argmin(e_i / p_i),
        # which selects index i with probability p_i. This reconstructs the
        # original branch, whose random.rand_like call does not exist;
        # torch.rand_like is used instead, with epsilons to avoid log(0)
        # and division by zero on masked-out tokens.
        exp_samples = -torch.log(torch.rand_like(probs) + 1e-10) / (probs + 1e-10)
        sampled_index = torch.argmin(exp_samples).item()
    elif sampling_technique == 'temperature':
        sampled_index = torch.multinomial(probs, 1).item()
    elif sampling_technique == 'greedy':
        sampled_index = torch.argmax(filtered_logits).item()
    else:
        raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")

    sampled_word = tokenizer.decode([sampled_index])

    # Replace [MASK] with the sampled word
    filled_sentence = sentence.replace('[MASK]', sampled_word)

    return filled_sentence
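A sketch of the intended hand-off from a masking helper to sample_word; the example sentence is made up, and logits is the filtered list that mask_non_stopword returns.

from masking_methods import mask_non_stopword
from sampling_methods import sample_word

sentence = "The committee approved the new budget on Friday"
masked_sentence, logits, top_words = mask_non_stopword(sentence)

if logits is not None:
    filled = sample_word(masked_sentence, top_words, logits,
                         sampling_technique='temperature', temperature=0.8)
    print(filled)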
scores.py
ADDED
@@ -0,0 +1,51 @@
import torch
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from transformers import BertTokenizer, BertModel

# Load BERT once at module level instead of on every call
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Calculate the BLEU score (reference and candidate should be token lists,
# as nltk's sentence_bleu expects)
def calculate_bleu(reference, candidate):
    return sentence_bleu([reference], candidate)

# Calculate a BERT-based similarity score
def calculate_bert(reference, candidate):
    reference_ids = bert_tokenizer.encode(reference, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt")
    candidate_ids = bert_tokenizer.encode(candidate, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt")

    with torch.no_grad():
        reference_outputs = bert_model(reference_ids)
        candidate_outputs = bert_model(candidate_ids)

    # Compare the [CLS] embeddings with cosine similarity
    reference_embeddings = reference_outputs[0][:, 0, :].numpy()
    candidate_embeddings = candidate_outputs[0][:, 0, :].numpy()

    cosine_similarity = np.dot(reference_embeddings, candidate_embeddings.T) / (np.linalg.norm(reference_embeddings) * np.linalg.norm(candidate_embeddings))
    return np.mean(cosine_similarity)

# Calculate minimum edit (Levenshtein) distance via dynamic programming
def min_edit_distance(reference, candidate):
    m = len(reference)
    n = len(candidate)

    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0:
                dp[i][j] = j
            elif j == 0:
                dp[i][j] = i
            elif reference[i - 1] == candidate[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i][j - 1],      # Insert
                                   dp[i - 1][j],      # Remove
                                   dp[i - 1][j - 1])  # Replace

    return dp[m][n]
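A quick check of the two lightweight metrics: sentence_bleu expects token lists, while min_edit_distance works on plain strings (the kitten/sitting distance is the textbook 3).

from scores import calculate_bleu, min_edit_distance

ref = "the cat sat on the mat".split()
cand = "the cat sat on a mat".split()
print(calculate_bleu(ref, cand))
print(min_edit_distance("kitten", "sitting"))  # 3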
threeD_plot.py
ADDED
@@ -0,0 +1,69 @@
import numpy as np
import plotly.graph_objects as go
from scipy.interpolate import griddata

def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
    # Convert input lists to NumPy arrays
    detectability = np.array(detectability_val)
    distortion = np.array(distortion_val)
    euclidean = np.array(euclidean_val)

    # Normalize the values to the range [0, 1]
    def normalize(data):
        min_val, max_val = np.min(data), np.max(data)
        return (data - min_val) / (max_val - min_val) if max_val > min_val else np.zeros_like(data)

    norm_detectability = normalize(detectability)
    norm_distortion = normalize(distortion)
    norm_euclidean = normalize(euclidean)

    # Composite score: maximize detectability, minimize distortion and Euclidean distance
    composite_score = norm_detectability - (norm_distortion + norm_euclidean)

    # Sweet spot: the point with the best composite score
    sweet_spot_index = np.argmax(composite_score)
    sweet_spot = (detectability[sweet_spot_index], distortion[sweet_spot_index], euclidean[sweet_spot_index])

    # Create a meshgrid for interpolation
    x_grid, y_grid = np.meshgrid(
        np.linspace(np.min(detectability), np.max(detectability), 30),
        np.linspace(np.min(distortion), np.max(distortion), 30)
    )

    # Interpolate z values (Euclidean distances) to fit the grid;
    # griddata fills points outside the convex hull with NaN rather
    # than returning None, so check for an all-NaN result
    z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='linear')

    if np.all(np.isnan(z_grid)):
        raise ValueError("griddata could not generate a valid interpolation. Check your input data.")

    # Create the 3D surface plot with the Plasma color scale
    fig = go.Figure(data=go.Surface(
        z=z_grid,
        x=x_grid,
        y=y_grid,
        contours={"z": {"show": True, "start": np.min(euclidean), "end": np.max(euclidean), "size": 0.1, "usecolormap": True}},
        colorscale='Plasma'
    ))

    # Add a marker for the sweet spot
    fig.add_trace(go.Scatter3d(
        x=[sweet_spot[0]],
        y=[sweet_spot[1]],
        z=[sweet_spot[2]],
        mode='markers+text',
        marker=dict(size=10, color='red', symbol='circle'),
        text=["Sweet Spot"],
        textposition="top center"
    ))

    # Set axis labels
    fig.update_layout(
        scene=dict(
            xaxis_title='Detectability Score',
            yaxis_title='Distortion Score',
            zaxis_title='Euclidean Distance'
        ),
        margin=dict(l=0, r=0, b=0, t=0)
    )

    return fig
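A sketch with synthetic score lists; in the app the three lists come from detectability.py, distortion.py and euclidean_distance.py, and only need to be equal-length.

import random
from threeD_plot import gen_three_D_plot

detectability = [random.random() for _ in range(50)]
distortion = [random.random() for _ in range(50)]
euclidean = [random.random() for _ in range(50)]

fig = gen_three_D_plot(detectability, distortion, euclidean)
fig.show()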
tree.py
ADDED
@@ -0,0 +1,240 @@
import plotly.graph_objects as go
import textwrap
import re
from collections import defaultdict

def apply_lcs_numbering(sentence, common_grams):
    """Apply LCS numbering based on common grams."""
    for idx, lcs in common_grams:
        sentence = re.sub(rf"\b{lcs}\b", f"({idx}){lcs}", sentence)
    return sentence

def highlight_words(sentence, color_map):
    """Wrap the words to highlight in {{...}} markers; the colors themselves are applied later by color_highlighted_words."""
    for word, color in color_map.items():
        sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
    return sentence

def clean_and_wrap_nodes(nodes, highlight_info):
    """Clean nodes by removing level labels and wrap text for display."""
    global_color_map = dict(highlight_info)
    cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
    highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
    return ['<br>'.join(textwrap.wrap(node, width=55)) for node in highlighted_nodes]

def get_levels_and_edges(nodes):
    """Determine levels from the ' LN' suffix and create edges dynamically."""
    levels = {}
    edges = []
    for i, node in enumerate(nodes):
        level = int(node.split()[-1][1])
        levels[i] = level

    # Create edges from the first level-0 node to all level-1 nodes
    root_node = next(i for i, level in levels.items() if level == 0)
    edges.extend((root_node, i) for i, level in levels.items() if level == 1)

    return levels, edges

def calculate_positions(levels):
    """Calculate x, y positions for each node based on levels."""
    # Count the nodes on each level first; the original computed y_offsets
    # from a still-empty defaultdict, which raised a KeyError on lookup below
    level_heights = defaultdict(int)
    for level in levels.values():
        level_heights[level] += 1

    y_offsets = {level: -(height - 1) / 2 for level, height in level_heights.items()}
    x_gap = 2
    l1_y_gap = 10

    positions = {}
    for node, level in levels.items():
        positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
        y_offsets[level] += 1

    return positions

def color_highlighted_words(node, color_map):
    """Color the {{...}}-marked words in a wrapped node string."""
    parts = re.split(r'(\{\{.*?\}\})', node)
    colored_parts = [
        f"<span style='color: {color_map.get(match.group(1), 'black')};'>{match.group(1)}</span>"
        if (match := re.match(r'\{\{(.*?)\}\}', part))
        else part
        for part in parts
    ]
    return ''.join(colored_parts)

def generate_subplot(paraphrased_sentence, scheme_sentences, highlight_info, common_grams, subplot_number):
    """Generate a subplot based on the input sentences and highlight info."""
    # Combine nodes into one list with appropriate level labels
    nodes = [paraphrased_sentence + ' L0'] + [s + ' L1' for s in scheme_sentences]

    # Apply LCS numbering and clean/wrap nodes
    nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]
    wrapped_nodes = clean_and_wrap_nodes(nodes, highlight_info)

    # Get levels and edges
    levels, edges = get_levels_and_edges(nodes)
    positions = calculate_positions(levels)

    # Create figure
    fig = go.Figure()

    # Add nodes and edges to the figure
    for i, node in enumerate(wrapped_nodes):
        colored_node = color_highlighted_words(node, dict(highlight_info))
        x, y = positions[i]

        fig.add_trace(go.Scatter(
            x=[-x],  # Reflect the x coordinate
            y=[y],
            mode='markers',
            marker=dict(size=10, color='blue'),
            hoverinfo='none'
        ))
        fig.add_annotation(
            x=-x,  # Reflect the x coordinate
            y=y,
            text=colored_node,
            showarrow=False,
            xshift=15,
            align="center",
            font=dict(size=12),
            bordercolor='black',
            borderwidth=1,
            borderpad=2,
            bgcolor='white',
            width=300,
            height=120
        )

    # Add edges and edge annotations
    edge_texts = [
        "Highest Entropy Masking", "Pseudo-random Masking", "Random Masking",
        "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
        "Inverse Transform Sampling", "Greedy Sampling", "Temperature Sampling",
        "Exponential Minimum Sampling", "Inverse Transform Sampling",
        "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
        "Inverse Transform Sampling"
    ]

    for i, edge in enumerate(edges):
        x0, y0 = positions[edge[0]]
        x1, y1 = positions[edge[1]]
        fig.add_trace(go.Scatter(
            x=[-x0, -x1],  # Reflect the x coordinates
            y=[y0, y1],
            mode='lines',
            line=dict(color='black', width=1)
        ))

        # Add text annotation above the edge
        mid_x = (-x0 + -x1) / 2
        mid_y = (y0 + y1) / 2
        fig.add_annotation(
            x=mid_x,
            y=mid_y + 0.8,  # Shift the text upwards
            text=edge_texts[i],  # Use the text specific to this edge
            showarrow=False,
            font=dict(size=12),
            align="center"
        )

    fig.update_layout(
        showlegend=False,
        margin=dict(t=20, b=20, l=20, r=20),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        width=1435,
        height=1000
    )

    return fig

def generate_subplot1(paraphrased_sentence, scheme_sentences, highlight_info, common_grams):
    return generate_subplot(paraphrased_sentence, scheme_sentences, highlight_info, common_grams, subplot_number=1)

def generate_subplot2(scheme_sentences, sampled_sentence, highlight_info, common_grams):
    nodes = scheme_sentences + [s + ' L1' for s in sampled_sentence]
    for i in range(len(scheme_sentences)):
        nodes[i] += ' L0'  # Reassign levels for the scheme sentences

    # Apply LCS numbering and clean/wrap nodes
    nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]
    wrapped_nodes = clean_and_wrap_nodes(nodes, highlight_info)

    # Get levels and edges
    levels, edges = get_levels_and_edges(nodes)
    positions = calculate_positions(levels)

    # Create figure
    fig2 = go.Figure()

    # Add nodes and edges to the figure
    for i, node in enumerate(wrapped_nodes):
        colored_node = color_highlighted_words(node, dict(highlight_info))
        x, y = positions[i]

        fig2.add_trace(go.Scatter(
            x=[-x],  # Reflect the x coordinate
            y=[y],
            mode='markers',
            marker=dict(size=10, color='blue'),
            hoverinfo='none'
        ))
        fig2.add_annotation(
            x=-x,  # Reflect the x coordinate
            y=y,
            text=colored_node,
            showarrow=False,
            xshift=15,
            align="center",
            font=dict(size=12),
            bordercolor='black',
            borderwidth=1,
            borderpad=2,
            bgcolor='white',
            width=450,
            height=65
        )

    # Add edges and text above each edge
    edge_texts = [
        "Highest Entropy Masking", "Pseudo-random Masking", "Random Masking",
        "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
        "Inverse Transform Sampling", "Greedy Sampling", "Temperature Sampling",
        "Exponential Minimum Sampling", "Inverse Transform Sampling",
        "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
        "Inverse Transform Sampling"
    ]

    for i, edge in enumerate(edges):
        x0, y0 = positions[edge[0]]
        x1, y1 = positions[edge[1]]
        fig2.add_trace(go.Scatter(
            x=[-x0, -x1],  # Reflect the x coordinates
            y=[y0, y1],
            mode='lines',
            line=dict(color='black', width=1)
        ))

        # Add text annotation above the edge
        mid_x = (-x0 + -x1) / 2
        mid_y = (y0 + y1) / 2
        fig2.add_annotation(
            x=mid_x,
            y=mid_y + 0.8,  # Shift the text upwards
            text=edge_texts[i],  # Use the text specific to this edge
            showarrow=False,
            font=dict(size=12),
            align="center"
        )

    fig2.update_layout(
        showlegend=False,
        margin=dict(t=20, b=20, l=20, r=20),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        width=1435,
        height=1000
    )

    return fig2
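A sketch of the inputs generate_subplot1 expects; every value below is made up. highlight_info pairs words with colors, and common_grams uses the same (index, ngram) shape as elsewhere in the repo.

from tree import generate_subplot1

paraphrased = "The committee approved the new budget on Friday"
scheme_sentences = [
    "The committee approved the [MASK] budget on Friday",
    "The committee [MASK] the new budget on Friday",
]
highlight_info = [("committee", "red"), ("budget", "blue")]
common_grams = [(1, "committee"), (2, "Friday")]

fig = generate_subplot1(paraphrased, scheme_sentences, highlight_info, common_grams)
fig.show()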
vocabulary_split.py
ADDED
@@ -0,0 +1,56 @@
import random
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load tokenizer and model once
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

def split_vocabulary(seed=42):
    """Split the vocabulary into permissible and non-permissible buckets."""
    # Get the full vocabulary
    vocab = list(tokenizer.get_vocab().items())

    # Initialize the random number generator
    random.seed(seed)

    # Split the vocabulary roughly in half
    permissible = {}
    non_permissible = {}

    for word, index in vocab:
        target_dict = permissible if random.random() < 0.5 else non_permissible
        target_dict[word] = index

    return permissible, non_permissible

def get_logits_for_mask(sentence):
    """Get the logits for the masked token in the sentence."""
    inputs = tokenizer(sentence, return_tensors="pt")
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

    with torch.no_grad():
        logits = model(**inputs).logits[0, mask_token_index, :]

    return logits.squeeze()

def filter_logits(logits, permissible_indices):
    """Filter logits based on permissible indices."""
    filtered_logits = logits.clone()

    # Set logits to -inf for non-permissible indices
    filtered_logits[~permissible_indices] = float('-inf')

    return filtered_logits

# Usage example, guarded so importing this module does not run a forward pass
if __name__ == "__main__":
    permissible, _ = split_vocabulary(seed=42)

    # Create a boolean mask over the vocabulary for permissible token ids
    permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))], dtype=torch.bool)

    # When sampling:
    sentence = "The [MASK] is bright today."
    logits = get_logits_for_mask(sentence)
    filtered_logits = filter_logits(logits, permissible_indices)
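A small sanity check of the split: the two buckets are disjoint, cover the whole vocabulary, and the fixed seed makes the split reproducible, which is what lets the detector rebuild the same permissible list later.

from vocabulary_split import split_vocabulary, tokenizer

permissible, non_permissible = split_vocabulary(seed=42)
assert len(permissible) + len(non_permissible) == len(tokenizer.get_vocab())
assert not set(permissible) & set(non_permissible)

again, _ = split_vocabulary(seed=42)
assert again == permissible  # same seed, same split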
watermark_detector.py
ADDED
@@ -0,0 +1,75 @@
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForMaskedLM
from vocabulary_split import split_vocabulary, filter_logits
import torch
from lcs import find_common_subsequences
from paraphraser import generate_paraphrase

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")

permissible, _ = split_vocabulary(seed=42)
permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])

def get_non_melting_points(original_sentence):
    paraphrased_sentences = generate_paraphrase(original_sentence)
    common_subsequences = find_common_subsequences(original_sentence, paraphrased_sentences)
    return common_subsequences

def get_word_between_points(sentence, start_point, end_point):
    words = nltk.word_tokenize(sentence)
    stop_words = set(stopwords.words('english'))

    # Locate the two non-melting points at the word level; the original used
    # character offsets from str.index() to slice the word list, mixing the
    # two index spaces. This picks the first occurrence of each point's
    # leading word, which is sufficient for the demo.
    start_index = words.index(nltk.word_tokenize(start_point[1])[0])
    end_index = words.index(nltk.word_tokenize(end_point[1])[0])

    for word in words[start_index + 1:end_index]:
        if word.lower() not in stop_words:
            return word, words.index(word)
    return None, None

def get_logits_for_mask(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    mask_token_logits = logits[0, mask_token_index, :]
    return mask_token_logits.squeeze()

def detect_watermark(sentence):
    non_melting_points = get_non_melting_points(sentence)

    if len(non_melting_points) < 2:
        return False, "Not enough non-melting points found."

    word_to_check, index = get_word_between_points(sentence, non_melting_points[0], non_melting_points[1])

    if word_to_check is None:
        return False, "No suitable word found between non-melting points."

    words = nltk.word_tokenize(sentence)
    masked_sentence = ' '.join(words[:index] + ['[MASK]'] + words[index+1:])

    logits = get_logits_for_mask(masked_sentence)
    filtered_logits = filter_logits(logits, permissible_indices)

    top_predictions = filtered_logits.argsort()[-5:]
    predicted_words = [tokenizer.decode([i]) for i in top_predictions]

    if word_to_check in predicted_words:
        return True, f"Watermark detected. The word '{word_to_check}' is in the permissible vocabulary."
    else:
        return False, f"No watermark detected. The word '{word_to_check}' is not in the permissible vocabulary."

# Example usage
# if __name__ == "__main__":
#     test_sentence = "The quick brown fox jumps over the lazy dog."
#     is_watermarked, message = detect_watermark(test_sentence)
#     print(f"Is the sentence watermarked? {is_watermarked}")
#     print(f"Detection message: {message}")
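End to end, detect_watermark re-derives the non-melting points by paraphrasing the input, so OPENAI_API_KEY must be set and each call costs API usage. A sketch with a hypothetical input sentence:

from watermark_detector import detect_watermark

candidate = "The committee approved the new budget on Friday"  # hypothetical input
is_watermarked, message = detect_watermark(candidate)
print(is_watermarked)
print(message)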