BheemaShankerNeyigapula committed
Commit ea6afa4
1 Parent(s): 00dafdf

Upload folder using huggingface_hub

.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ __pycache__/
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,8 @@
  ---
- title: Aiisc Watermarking Model
- emoji: 🚀
- colorFrom: blue
- colorTo: gray
- sdk: gradio
- sdk_version: 5.4.0
+ title: aiisc-watermarking-model
  app_file: app.py
- pinned: false
+ sdk: gradio
+ sdk_version: 4.36.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Clone the repository and ``cd`` into it. Run ``gradio app.py`` to start the server.
app.py ADDED
@@ -0,0 +1,169 @@
+ import nltk
+ nltk.download('stopwords')
+ import plotly.graph_objs as go
+ from transformers import pipeline
+ import random
+ import gradio as gr
+ from tree import generate_subplot1, generate_subplot2
+ from paraphraser import generate_paraphrase
+ from lcs import find_common_subsequences, find_common_gram_positions
+ from highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
+ from entailment import analyze_entailment
+ from masking_methods import mask_non_stopword, high_entropy_words
+ from sampling_methods import sample_word
+ from detectability import SentenceDetectabilityCalculator
+ from distortion import SentenceDistortionCalculator
+ from euclidean_distance import SentenceEuclideanDistanceCalculator
+ from threeD_plot import gen_three_D_plot
+
+
+ # Function for the Gradio interface
+ def model(prompt):
+     user_prompt = prompt
+     paraphrased_sentences = generate_paraphrase(user_prompt)
+     analyzed_paraphrased_sentences, selected_sentences, discarded_sentences = analyze_entailment(user_prompt, paraphrased_sentences, 0.7)
+
+     common_grams = find_common_subsequences(user_prompt, selected_sentences)
+     subsequences = [subseq for _, subseq in common_grams]
+     # find_common_gram_positions expects the (index, phrase) pairs, not bare phrases
+     common_grams_position = find_common_gram_positions(selected_sentences, common_grams)
+
+     # Create masked results using a single loop
+     masked_results = []
+     for sentence in paraphrased_sentences:
+         masked_results.extend([
+             (mask_non_stopword, sentence),
+             (mask_non_stopword, sentence, True),
+             (high_entropy_words, sentence, common_grams)
+         ])
+
+     # Process masking functions and unpack results
+     masked_outputs = [func(sent, *extra) for func, sent, *extra in masked_results]
+
+     # Unpack masked outputs into separate lists
+     masked_sentences, masked_words, masked_logits = zip(*masked_outputs) if masked_outputs else ([], [], [])
+
+     sampled_sentences = []
+     for masked_sent, words, logits in zip(masked_sentences, masked_words, masked_logits):
+         for technique in ['inverse_transform', 'exponential_minimum', 'temperature', 'greedy']:
+             sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique=technique, temperature=1.0))
+
+     colors = ["red", "blue", "brown", "green"]
+
+     def select_color():
+         return random.choice(colors)
+
+     highlight_info = [(word, select_color()) for _, word in common_grams]
+
+     highlighted_user_prompt = highlight_common_words(common_grams, [user_prompt], "Non-melting Points in the User Prompt")
+     highlighted_accepted_sentences = highlight_common_words_dict(common_grams, selected_sentences, "Paraphrased Sentences")
+     highlighted_discarded_sentences = highlight_common_words_dict(common_grams, discarded_sentences, "Discarded Sentences")
+
+     trees1, trees2 = [], []
+
+     for i, sentence in enumerate(paraphrased_sentences):
+         next_masked_sentences = masked_sentences[i * 3:(i + 1) * 3]
+         next_sampled_sentences = sampled_sentences[i * 12:(i + 1) * 12]
+
+         tree1 = generate_subplot1(sentence, next_masked_sentences, highlight_info, common_grams)
+         trees1.append(tree1)
+
+         tree2 = generate_subplot2(next_masked_sentences, next_sampled_sentences, highlight_info, common_grams)
+         trees2.append(tree2)
+
+     reparaphrased_sentences = generate_paraphrase(sampled_sentences)
+
+     # Process the sentences in batches of 10
+     reparaphrased_sentences_list = []
+     for i in range(0, len(reparaphrased_sentences), 10):
+         batch = reparaphrased_sentences[i:i + 10]
+         if len(batch) == 10:
+             html_block = reparaphrased_sentences_html(batch)
+             reparaphrased_sentences_list.append(html_block)
+
+     # Calculate metrics
+     distortion_calculator = SentenceDistortionCalculator(user_prompt, reparaphrased_sentences)
+     distortion_calculator.calculate_all_metrics()
+     distortion_calculator.normalize_metrics()
+     distortion_calculator.calculate_combined_distortion()
+     distortion = distortion_calculator.get_combined_distortions()
+     distortion_list = list(distortion.values())
+
+     detectability_calculator = SentenceDetectabilityCalculator(user_prompt, reparaphrased_sentences)
+     detectability_calculator.calculate_all_metrics()
+     detectability_calculator.normalize_metrics()
+     detectability_calculator.calculate_combined_detectability()
+     detectability = detectability_calculator.get_combined_detectabilities()
+     detectability_list = list(detectability.values())
+
+     # Euclidean distances are computed in the calculator's __init__
+     euclidean_dist_calculator = SentenceEuclideanDistanceCalculator(user_prompt, reparaphrased_sentences)
+     euclidean_dist = euclidean_dist_calculator.get_normalized_metrics()
+     euclidean_dist_list = list(euclidean_dist.values())
+
+     three_D_plot = gen_three_D_plot(detectability_list, distortion_list, euclidean_dist_list)
+
+     return [highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + trees1 + trees2 + reparaphrased_sentences_list + [three_D_plot]
+
+
+ # Gradio Interface
+ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
+     gr.Markdown("# **AIISC Watermarking Model**")
+
+     with gr.Row():
+         user_input = gr.Textbox(label="User Prompt")
+
+     with gr.Row():
+         submit_button = gr.Button("Submit")
+         clear_button = gr.Button("Clear")
+
+     with gr.Row():
+         highlighted_user_prompt = gr.HTML()
+
+     with gr.Row():
+         with gr.Tabs():
+             with gr.TabItem("Paraphrased Sentences"):
+                 highlighted_accepted_sentences = gr.HTML()
+             with gr.TabItem("Discarded Sentences"):
+                 highlighted_discarded_sentences = gr.HTML()
+
+     with gr.Row():
+         gr.Markdown("### Where to Watermark?")  # Label for masked sentences trees
+     with gr.Row():
+         with gr.Tabs():
+             tree1_tabs = [gr.Plot() for _ in range(10)]  # Adjust this range according to the number of trees
+             for i, tree1 in enumerate(tree1_tabs):
+                 with gr.TabItem(f"Sentence {i + 1}"):
+                     pass  # Placeholder for each tree plot
+
+     with gr.Row():
+         gr.Markdown("### How to Watermark?")  # Label for sampled sentences trees
+     with gr.Row():
+         with gr.Tabs():
+             tree2_tabs = [gr.Plot() for _ in range(10)]  # Adjust this range according to the number of trees
+             for i, tree2 in enumerate(tree2_tabs):
+                 with gr.TabItem(f"Sentence {i + 1}"):
+                     pass  # Placeholder for each tree plot
+
+     with gr.Row():
+         gr.Markdown("### Re-paraphrased Sentences")  # Label for re-paraphrased sentences
+
+     with gr.Row():
+         with gr.Tabs():
+             reparaphrased_sentences_tabs = [gr.HTML() for _ in range(120)]  # 120 tabs for 120 batches of sentences
+             for i, reparaphrased_sent_html in enumerate(reparaphrased_sentences_tabs):
+                 with gr.TabItem(f"Sentence {i + 1}"):
+                     pass  # Placeholder for each batch
+
+     with gr.Row():
+         gr.Markdown("### 3D Plot for Sweet Spot")
+     with gr.Row():
+         three_D_plot = gr.Plot()
+
+     submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs + reparaphrased_sentences_tabs + [three_D_plot])
+     clear_button.click(lambda: "", inputs=None, outputs=user_input)
+     clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs + reparaphrased_sentences_tabs + [three_D_plot])
+
+ demo.launch(share=True)
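
The click handler above binds a fixed set of output components, so model() has to return exactly that many values, in the same order: 3 HTML blocks, 10 + 10 tree plots, 120 re-paraphrase tabs, and one 3D plot. A quick sanity check of that coupling (a sketch; the counts are read off the component lists above):

# Sketch: the outputs list passed to submit_button.click() has a fixed length,
# so model() must return the same number of values in the same order.
n_outputs = 3 + 10 + 10 + 120 + 1  # HTML blocks + tree plots (x2) + re-paraphrase tabs + 3D plot
assert n_outputs == 144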
detectability.py ADDED
@@ -0,0 +1,303 @@
+ # Import necessary libraries
+ import nltk
+ import numpy as np
+ import torch
+ import matplotlib.pyplot as plt
+ from sklearn.metrics.pairwise import cosine_similarity
+ from transformers import BertModel, BertTokenizer
+ from sentence_transformers import SentenceTransformer
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+
+ # Download NLTK data if not already present
+ nltk.download('punkt', quiet=True)
+
+ class SentenceDetectabilityCalculator:
+     """
+     A class to calculate and analyze detectability metrics between an original sentence and paraphrased sentences.
+     """
+
+     def __init__(self, original_sentence, paraphrased_sentences):
+         """
+         Initialize the calculator with the original sentence and a list of paraphrased sentences.
+         """
+         self.original_sentence = original_sentence
+         self.paraphrased_sentences = paraphrased_sentences
+         self.metrics = {
+             'BLEU Score': {},
+             'Cosine Similarity': {},
+             'STS Score': {}
+         }
+         self.normalized_metrics = {
+             'BLEU Score': {},
+             'Cosine Similarity': {},
+             'STS Score': {}
+         }
+         self.combined_detectabilities = {}
+
+         # Load pre-trained models
+         self.bert_model = BertModel.from_pretrained('bert-base-uncased')
+         self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+         self.sts_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+
+         # Calculate original embeddings
+         self.original_embedding = self._get_sentence_embedding(self.original_sentence)
+         self.sts_original_embedding = self.sts_model.encode(self.original_sentence)
+
+     def calculate_all_metrics(self):
+         """
+         Calculate all detectability metrics for each paraphrased sentence.
+         """
+         for idx, paraphrased_sentence in enumerate(self.paraphrased_sentences):
+             key = f"Sentence_{idx + 1}"
+             self.metrics['BLEU Score'][key] = self._calculate_bleu(self.original_sentence, paraphrased_sentence)
+             paraphrase_embedding = self._get_sentence_embedding(paraphrased_sentence)
+             self.metrics['Cosine Similarity'][key] = cosine_similarity([self.original_embedding], [paraphrase_embedding])[0][0]
+             sts_paraphrase_embedding = self.sts_model.encode(paraphrased_sentence)
+             self.metrics['STS Score'][key] = cosine_similarity([self.sts_original_embedding], [sts_paraphrase_embedding])[0][0]
+
+     def normalize_metrics(self):
+         """
+         Normalize all metrics to be between 0 and 1.
+         """
+         for metric_name, metric_dict in self.metrics.items():
+             self.normalized_metrics[metric_name] = self._normalize_dict(metric_dict)
+
+     def calculate_combined_detectability(self):
+         """
+         Calculate the combined detectability using the root mean square of the normalized metrics.
+         """
+         for key in self.normalized_metrics['BLEU Score'].keys():
+             rms = np.sqrt(sum(
+                 self.normalized_metrics[metric][key] ** 2 for metric in self.normalized_metrics
+             ) / len(self.normalized_metrics))
+             self.combined_detectabilities[key] = rms
+
+     def plot_metrics(self):
+         """
+         Plot each normalized metric and the combined detectability in separate graphs.
+         """
+         keys = list(self.normalized_metrics['BLEU Score'].keys())
+         indices = np.arange(len(keys))
+
+         # Prepare data for plotting
+         metrics = {name: [self.normalized_metrics[name][key] for key in keys] for name in self.normalized_metrics}
+
+         # Plot each metric separately
+         for metric_name, values in metrics.items():
+             plt.figure(figsize=(12, 6))
+             plt.plot(indices, values, marker='o', color=np.random.rand(3,))
+             plt.xlabel('Sentence Index')
+             plt.ylabel('Normalized Value (0-1)')
+             plt.title(f'Normalized {metric_name}')
+             plt.grid(True)
+             plt.tight_layout()
+             plt.show()
+
+     # Private methods for metric calculations
+     def _calculate_bleu(self, reference, candidate):
+         """
+         Calculate the BLEU score between the original and paraphrased sentence using smoothing.
+         """
+         reference_tokens = nltk.word_tokenize(reference)
+         candidate_tokens = nltk.word_tokenize(candidate)
+         smoothing = SmoothingFunction().method1
+         return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)
+
+     def _get_sentence_embedding(self, sentence):
+         """
+         Get sentence embedding using BERT.
+         """
+         tokens = self.bert_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
+         with torch.no_grad():
+             outputs = self.bert_model(**tokens)
+         return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+
+     def _normalize_dict(self, metric_dict):
+         """
+         Normalize the values in a dictionary to be between 0 and 1.
+         """
+         values = np.array(list(metric_dict.values()))
+         min_val = values.min()
+         max_val = values.max()
+         # Avoid division by zero if all values are the same
+         return dict(zip(metric_dict.keys(), np.zeros_like(values) if max_val - min_val == 0 else (values - min_val) / (max_val - min_val)))
+
+     # Getter methods
+     def get_normalized_metrics(self):
+         """
+         Get all normalized metrics as a dictionary.
+         """
+         return self.normalized_metrics
+
+     def get_combined_detectabilities(self):
+         """
+         Get the dictionary of combined detectability values.
+         """
+         return self.combined_detectabilities
+
+
+ # Example usage
+ if __name__ == "__main__":
+     # Original sentence
+     original_sentence = "The quick brown fox jumps over the lazy dog"
+
+     # Paraphrased sentences
+     paraphrased_sentences = [
+         # Original 1: "A swift auburn fox leaps across a sleepy canine."
+         "The swift auburn fox leaps across a sleepy canine.",
+         "A quick auburn fox leaps across a sleepy canine.",
+         "A swift ginger fox leaps across a sleepy canine.",
+         "A swift auburn fox bounds across a sleepy canine.",
+         "A swift auburn fox leaps across a tired canine.",
+         "Three swift auburn foxes leap across a sleepy canine.",
+         "The vulpine specimen rapidly traverses over a dormant dog.",
+         "Like lightning, the russet hunter soars over the drowsy guardian.",
+         "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
+         "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
+         "A swift auburn predator navigates across a lethargic pet.",
+         "Subject A (fox) demonstrates velocity over Subject B (dog).",
+
+         # Original 2: "The agile russet fox bounds over an idle hound."
+         "Some agile russet foxes bound over an idle hound.",
+         "The nimble russet fox bounds over an idle hound.",
+         "The agile brown fox bounds over an idle hound.",
+         "The agile russet fox jumps over an idle hound.",
+         "The agile russet fox bounds over a lazy hound.",
+         "Two agile russet foxes bound over an idle hound.",
+         "A dexterous vulpine surpasses a stationary canine.",
+         "Quick as thought, the copper warrior sails over the guardian.",
+         "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
+         "A dexterous V. vulpes exceeds the plane of an inactive canine.",
+         "An agile russet hunter maneuvers above a resting hound.",
+         "Test subject F-1 achieves displacement superior to subject D-1.",
+
+         # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
+         "The nimble mahogany vulpine vaults above a drowsy dog.",
+         "A swift mahogany vulpine vaults above a drowsy dog.",
+         "A nimble reddish vulpine vaults above a drowsy dog.",
+         "A nimble mahogany fox vaults above a drowsy dog.",
+         "A nimble mahogany vulpine leaps above a drowsy dog.",
+         "Four nimble mahogany vulpines vault above a drowsy dog.",
+         "An agile specimen of reddish fur surpasses a somnolent canine.",
+         "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
+         "Tha quick brown beastie jumps o'er the tired pup, aye.",
+         "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
+         "A nimble rust-colored predator crosses above a drowsy pet.",
+         "Observed: Subject Red executes vertical motion over Subject Gray.",
+
+         # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
+         "A speedy copper-colored fox hops over the lethargic pup.",
+         "The quick copper-colored fox hops over the lethargic pup.",
+         "The speedy bronze fox hops over the lethargic pup.",
+         "The speedy copper-colored fox jumps over the lethargic pup.",
+         "The speedy copper-colored fox hops over the tired pup.",
+         "Multiple speedy copper-colored foxes hop over the lethargic pup.",
+         "A rapid vulpine of bronze hue traverses an inactive young canine.",
+         "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
+         "Tha fast copper beastie leaps o'er the sleepy wee dog.",
+         "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
+         "A fleet copper-toned predator moves past a sluggish young dog.",
+         "Field note: Adult fox subject exceeds puppy subject vertically.",
+
+         # Original 5: "A rapid tawny fox springs over a sluggish dog."
+         "The rapid tawny fox springs over a sluggish dog.",
+         "A quick tawny fox springs over a sluggish dog.",
+         "A rapid golden fox springs over a sluggish dog.",
+         "A rapid tawny fox jumps over a sluggish dog.",
+         "A rapid tawny fox springs over a lazy dog.",
+         "Six rapid tawny foxes spring over a sluggish dog.",
+         "An expeditious yellowish vulpine surpasses a torpid canine.",
+         "Fast as a bullet, the golden hunter vaults over the idle guard.",
+         "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
+         "One V. vulpes displays rapid transit over one inactive C. familiaris.",
+         "A speedy yellow-brown predator bypasses a motionless dog.",
+         "Log entry: Vulpine subject achieves swift vertical displacement.",
+
+         # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
+         "A fleet-footed chestnut fox soars above an indolent canine.",
+         "The swift chestnut fox soars above an indolent canine.",
+         "The fleet-footed brown fox soars above an indolent canine.",
+         "The fleet-footed chestnut fox leaps above an indolent canine.",
+         "The fleet-footed chestnut fox soars above a lazy canine.",
+         "Several fleet-footed chestnut foxes soar above an indolent canine.",
+         "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
+         "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
+         "Tha quick brown beastie sails o'er the sleepy hound, ken.",
+         "Single agile V. vulpes achieves elevation above stationary canine.",
+         "A nimble brown predator glides over an unmoving domestic animal.",
+         "Research note: Brown subject displays superior vertical mobility.",
+
+         # Original 7: "A fast ginger fox hurdles past a slothful dog."
+         "The fast ginger fox hurdles past a slothful dog.",
+         "A quick ginger fox hurdles past a slothful dog.",
+         "A fast red fox hurdles past a slothful dog.",
+         "A fast ginger fox jumps past a slothful dog.",
+         "A fast ginger fox hurdles past a lazy dog.",
+         "Five fast ginger foxes hurdle past a slothful dog.",
+         "A rapid orange vulpine bypasses a lethargic canine.",
+         "Quick as lightning, the flame-colored hunter races past the lazy guard.",
+         "Tha swift ginger beastie leaps past the tired doggy, ye see.",
+         "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
+         "A speedy red-orange predator overtakes a motionless dog.",
+         "Data point: Orange subject demonstrates rapid transit past Gray subject.",
+
+         # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
+         "A spry rusty-colored fox jumps across a dozing hound.",
+         "The agile rusty-colored fox jumps across a dozing hound.",
+         "The spry reddish fox jumps across a dozing hound.",
+         "The spry rusty-colored fox leaps across a dozing hound.",
+         "The spry rusty-colored fox jumps across a sleeping hound.",
+         "Multiple spry rusty-colored foxes jump across a dozing hound.",
+         "An agile rust-toned vulpine traverses a somnolent canine.",
+         "Nimble as thought, the copper hunter bounds over the resting guard.",
+         "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
+         "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
+         "A lithe rust-tinted predator moves past a slumbering dog.",
+         "Observation: Russet subject exhibits agility over dormant subject.",
+
+         # Original 9: "A quick tan fox leaps over an inactive dog."
+         "The quick tan fox leaps over an inactive dog.",
+         "A swift tan fox leaps over an inactive dog.",
+         "A quick beige fox leaps over an inactive dog.",
+         "A quick tan fox jumps over an inactive dog.",
+         "A quick tan fox leaps over a motionless dog.",
+         "Seven quick tan foxes leap over an inactive dog.",
+         "A rapid light-brown vulpine surpasses a stationary canine.",
+         "Fast as wind, the sand-colored hunter soars over the still guard.",
+         "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
+         "One agile fawn V. vulpes traverses one immobile C. familiaris.",
+         "A fleet tan-colored predator bypasses an unmoving dog.",
+         "Field report: Tan subject demonstrates movement over static subject.",
+
+         # Original 10: "The brisk auburn vulpine bounces over a listless canine."
+         "Some brisk auburn vulpines bounce over a listless canine.",
+         "The quick auburn vulpine bounces over a listless canine.",
+         "The brisk russet vulpine bounces over a listless canine.",
+         "The brisk auburn fox bounces over a listless canine.",
+         "The brisk auburn vulpine jumps over a listless canine.",
+         "Five brisk auburn vulpines bounce over a listless canine.",
+         "The expeditious specimen supersedes a quiescent Canis lupus.",
+         "Swift as wind, the russet hunter vaults over the idle guardian.",
+         "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
+         "One V. vulpes achieves displacement over inactive C. familiaris.",
+         "A high-velocity auburn predator traverses an immobile animal.",
+         "Final observation: Red subject shows mobility over Gray subject."
+     ]
+
+     # Create the calculator instance
+     calculator = SentenceDetectabilityCalculator(original_sentence, paraphrased_sentences)
+
+     # Calculate metrics
+     calculator.calculate_all_metrics()
+     calculator.normalize_metrics()
+     calculator.calculate_combined_detectability()
+
+     # Plot metrics
+     calculator.plot_metrics()
+
+     # Get results
+     normalized_metrics = calculator.get_normalized_metrics()
+     combined_detectabilities = calculator.get_combined_detectabilities()
+
+     print("Normalized Metrics:", normalized_metrics)
+     print("Combined Detectabilities:", combined_detectabilities)
distortion.py ADDED
@@ -0,0 +1,126 @@
+ # Import necessary libraries
+ import nltk
+ import numpy as np
+ import torch
+ import matplotlib.pyplot as plt
+ from scipy.special import rel_entr
+ from collections import Counter
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+
+ # Download NLTK data if not already present
+ nltk.download('punkt', quiet=True)
+
+ class SentenceDistortionCalculator:
+     """
+     A class to calculate and analyze distortion metrics between an original sentence and modified sentences.
+     """
+
+     def __init__(self, original_sentence, modified_sentences):
+         self.original_sentence = original_sentence
+         self.modified_sentences = modified_sentences
+         self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+         self.model = GPT2LMHeadModel.from_pretrained("gpt2").eval()  # Set model to evaluation mode
+
+         # Raw metric dictionaries
+         self.metrics = {
+             'levenshtein': {},
+             'word_level_changes': {},
+             'kl_divergences': {},
+             'perplexities': {},
+         }
+
+         # Combined distortion dictionary
+         self.combined_distortions = {}
+
+     def calculate_all_metrics(self):
+         """Calculate all distortion metrics for each modified sentence."""
+         for idx, modified_sentence in enumerate(self.modified_sentences):
+             key = f"Sentence_{idx + 1}"
+             self.metrics['levenshtein'][key] = self._calculate_levenshtein_distance(modified_sentence)
+             self.metrics['word_level_changes'][key] = self._calculate_word_level_change(modified_sentence)
+             self.metrics['kl_divergences'][key] = self._calculate_kl_divergence(modified_sentence)
+             self.metrics['perplexities'][key] = self._calculate_perplexity(modified_sentence)
+
+     def normalize_metrics(self):
+         """Normalize all metrics to be between 0 and 1."""
+         for metric in self.metrics:
+             self.metrics[metric] = self._normalize_dict(self.metrics[metric])
+
+     def calculate_combined_distortion(self):
+         """Calculate the combined distortion using the root mean square of the normalized metrics."""
+         for key in self.metrics['levenshtein']:
+             rms = np.sqrt(sum(self.metrics[metric][key] ** 2 for metric in self.metrics) / len(self.metrics))
+             self.combined_distortions[key] = rms
+
+     def plot_metrics(self):
+         """Plot each normalized metric and the combined distortion in separate graphs."""
+         keys = list(self.metrics['levenshtein'].keys())
+         indices = np.arange(len(keys))
+
+         for metric_name, values in self.metrics.items():
+             plt.figure(figsize=(12, 6))
+             plt.plot(indices, list(values.values()), marker='o', label=metric_name)
+             plt.xlabel('Sentence Index')
+             plt.ylabel('Normalized Value (0-1)')
+             plt.title(f'Normalized {metric_name.replace("_", " ").title()}')
+             plt.grid(True)
+             plt.legend()
+             plt.tight_layout()
+             plt.show()
+
+     # Private methods for metric calculations
+     def _calculate_levenshtein_distance(self, modified_sentence):
+         """Calculate the Levenshtein Distance between the original and modified sentence."""
+         return nltk.edit_distance(self.original_sentence, modified_sentence)
+
+     def _calculate_word_level_change(self, modified_sentence):
+         """Calculate the proportion of word-level changes between the original and modified sentence."""
+         original_words = self.original_sentence.split()
+         modified_words = modified_sentence.split()
+         total_words = max(len(original_words), len(modified_words))
+         changed_words = sum(o != m for o, m in zip(original_words, modified_words)) + abs(len(original_words) - len(modified_words))
+         return changed_words / total_words if total_words > 0 else 0
+
+     def _calculate_kl_divergence(self, modified_sentence):
+         """Calculate the KL Divergence between the word distributions of the original and modified sentence."""
+         original_counts = Counter(self.original_sentence.lower().split())
+         modified_counts = Counter(modified_sentence.lower().split())
+         all_words = set(original_counts.keys()).union(modified_counts.keys())
+
+         original_probs = np.array([original_counts[word] for word in all_words], dtype=float)
+         modified_probs = np.array([modified_counts[word] for word in all_words], dtype=float)
+
+         original_probs /= original_probs.sum() + 1e-10  # Avoid division by zero
+         modified_probs /= modified_probs.sum() + 1e-10
+
+         return np.sum(rel_entr(original_probs, modified_probs))
+
+     def _calculate_perplexity(self, sentence):
+         """Calculate the perplexity of a sentence using GPT-2."""
+         encodings = self.tokenizer(sentence, return_tensors='pt')
+         stride = self.model.config.n_positions
+         log_likelihoods = []
+
+         for i in range(0, encodings.input_ids.size(1), stride):
+             input_ids = encodings.input_ids[:, i:i + stride]
+             with torch.no_grad():
+                 outputs = self.model(input_ids, labels=input_ids)
+             log_likelihoods.append(outputs.loss.item())
+
+         avg_log_likelihood = np.mean(log_likelihoods)
+         return torch.exp(torch.tensor(avg_log_likelihood)).item()
+
+     def _normalize_dict(self, metric_dict):
+         """Normalize the values in a dictionary to be between 0 and 1."""
+         values = np.array(list(metric_dict.values()))
+         min_val, max_val = values.min(), values.max()
+         normalized_values = (values - min_val) / (max_val - min_val) if max_val > min_val else np.zeros_like(values)
+         return dict(zip(metric_dict.keys(), normalized_values))
+
+     def get_normalized_metrics(self):
+         """Get all normalized metrics as a dictionary."""
+         return {metric: self._normalize_dict(values) for metric, values in self.metrics.items()}
+
+     def get_combined_distortions(self):
+         """Get the dictionary of combined distortion values."""
+         return self.combined_distortions
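
A minimal usage sketch for SentenceDistortionCalculator as defined above (the example sentences are illustrative; GPT-2 is downloaded on first use):

# Sketch: compute and combine the distortion metrics for two illustrative paraphrases.
from distortion import SentenceDistortionCalculator

original = "The quick brown fox jumps over the lazy dog"
modified = [
    "A quick brown fox leaps over a lazy dog",
    "The fox jumped over the sleeping dog",
]

calc = SentenceDistortionCalculator(original, modified)
calc.calculate_all_metrics()
calc.normalize_metrics()
calc.calculate_combined_distortion()
print(calc.get_combined_distortions())  # {'Sentence_1': ..., 'Sentence_2': ...}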
entailment.py ADDED
@@ -0,0 +1,33 @@
+ from transformers import pipeline
+
+ def analyze_entailment(original_sentence, paraphrased_sentences, threshold):
+     # Load the entailment model once
+     entailment_pipe = pipeline("text-classification", model="ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli")
+
+     all_sentences = {}
+     selected_sentences = {}
+     discarded_sentences = {}
+
+     # Prepare input for entailment checks
+     inputs = [f"{original_sentence} [SEP] {paraphrase}" for paraphrase in paraphrased_sentences]
+
+     # Perform entailment checks for all paraphrased sentences in one go
+     entailment_results = entailment_pipe(inputs, return_all_scores=True)
+
+     # Iterate over results
+     for paraphrased_sentence, results in zip(paraphrased_sentences, entailment_results):
+         # Extract the entailment score for each paraphrased sentence
+         entailment_score = next((result['score'] for result in results if result['label'] == 'entailment'), 0)
+
+         all_sentences[paraphrased_sentence] = entailment_score
+
+         # Store sentences based on the threshold
+         if entailment_score >= threshold:
+             selected_sentences[paraphrased_sentence] = entailment_score
+         else:
+             discarded_sentences[paraphrased_sentence] = entailment_score
+
+     return all_sentences, selected_sentences, discarded_sentences
+
+ # Example usage
+ # print(analyze_entailment("I love you", ["I adore you", "I hate you"], 0.7))
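
A minimal usage sketch for analyze_entailment (the NLI model is downloaded on first use; inputs mirror the commented example above):

# Sketch: split paraphrases into kept/discarded sets at a 0.7 entailment threshold.
from entailment import analyze_entailment

all_scores, selected, discarded = analyze_entailment(
    "I love you", ["I adore you", "I hate you"], 0.7
)
print(selected)   # paraphrases whose entailment score is >= 0.7
print(discarded)  # paraphrases that fall below the threshold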
euclidean_distance.py ADDED
@@ -0,0 +1,74 @@
+ # Import necessary libraries
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import euclidean_distances
+
+ class SentenceEuclideanDistanceCalculator:
+     """
+     A class to calculate and analyze Euclidean distance between an original sentence and paraphrased sentences.
+     """
+
+     def __init__(self, original_sentence, paraphrased_sentences):
+         """
+         Initialize the calculator with the original sentence and a list of paraphrased sentences.
+         """
+         self.original_sentence = original_sentence
+         self.paraphrased_sentences = paraphrased_sentences
+
+         # Load SentenceTransformer model for embedding calculation
+         self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+         # Precompute the original sentence embedding
+         self.original_embedding = self.model.encode(original_sentence, convert_to_tensor=True)
+
+         # Calculate Euclidean distances and normalize them
+         self.euclidean_distances = self._calculate_all_metrics()
+         self.normalized_euclidean = self._normalize_dict(self.euclidean_distances)
+
+     def _calculate_all_metrics(self):
+         """
+         Calculate Euclidean distance between the original and each paraphrased sentence.
+         """
+         distances = {}
+         paraphrase_embeddings = self.model.encode(self.paraphrased_sentences, convert_to_tensor=True)
+
+         for idx, paraphrase_embedding in enumerate(paraphrase_embeddings):
+             key = f"Sentence_{idx + 1}"
+             distances[key] = euclidean_distances([self.original_embedding], [paraphrase_embedding])[0][0]
+
+         return distances
+
+     def _normalize_dict(self, metric_dict):
+         """
+         Normalize the values in a dictionary to be between 0 and 1.
+         """
+         values = np.array(list(metric_dict.values()))
+         min_val, max_val = values.min(), values.max()
+
+         # Normalize values
+         normalized_values = (values - min_val) / (max_val - min_val) if max_val > min_val else np.zeros_like(values)
+         return dict(zip(metric_dict.keys(), normalized_values))
+
+     def plot_metrics(self):
+         """
+         Plot the normalized Euclidean distances in a graph.
+         """
+         keys = list(self.normalized_euclidean.keys())
+         indices = np.arange(len(keys))
+
+         plt.figure(figsize=(12, 6))
+         plt.plot(indices, [self.normalized_euclidean[key] for key in keys], marker='o', color=np.random.rand(3,))
+         plt.xlabel('Sentence Index')
+         plt.ylabel('Normalized Euclidean Distance (0-1)')
+         plt.title('Normalized Euclidean Distance')
+         plt.grid(True)
+         plt.tight_layout()
+         plt.show()
+
+     # Getter methods
+     def get_normalized_metrics(self):
+         """
+         Get the normalized Euclidean distances as a dictionary.
+         """
+         return self.normalized_euclidean
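
A minimal usage sketch for SentenceEuclideanDistanceCalculator (illustrative inputs; all distances are computed in __init__, so the getter can be called directly):

# Sketch: normalized embedding distances between a prompt and two paraphrases.
from euclidean_distance import SentenceEuclideanDistanceCalculator

calc = SentenceEuclideanDistanceCalculator(
    "The quick brown fox jumps over the lazy dog",
    ["A swift auburn fox leaps across a sleepy canine",
     "The agile russet fox bounds over an idle hound"],
)
print(calc.get_normalized_metrics())  # distances scaled to the range 0-1
calc.plot_metrics()                   # optional matplotlib line plot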
gpt_mask_filling.py ADDED
@@ -0,0 +1,70 @@
+ import openai
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ openai.api_key = os.getenv("API_KEY")
+
+
+ # Takes in a sentence and returns a list of dicts consisting of key-value pairs of masked words and lists of the possible replacements
+ def predict_masked_words(sentence, n_suggestions=5):
+
+     prompt = (
+         f"Given a sentence with masked words, masked word can be one or more than one, indicated by [MASK], generate {n_suggestions} possible words to fill each mask. "
+         "Return the results as a list of dictionaries, where each dictionary key is a masked word and its value is a list of 5 potential words to fill that mask.\n\n"
+         "Example input: \"The [MASK] fox [MASK] over the [MASK] dog.\"\n\n"
+         "Example output:\n"
+         "[\n"
+         " {\n"
+         " \"[MASK]1\": [\"quick\", \"sly\", \"red\", \"clever\", \"sneaky\"]\n"
+         " },\n"
+         " {\n"
+         " \"[MASK]2\": [\"jumped\", \"leaped\", \"hopped\", \"sprang\", \"bounded\"]\n"
+         " },\n"
+         " {\n"
+         " \"[MASK]3\": [\"lazy\", \"sleeping\", \"brown\", \"tired\", \"old\"]\n"
+         " }\n"
+         "]\n\n"
+         "Example input: \"The [MASK] [MASK] ran swiftly across the [MASK] field.\"\n\n"
+         "Example output:\n"
+         "[\n"
+         " {\n"
+         " \"[MASK]1\": [\"tall\", \"fierce\", \"young\", \"old\", \"beautiful\"]\n"
+         " },\n"
+         " {\n"
+         " \"[MASK]2\": [\"lion\", \"tiger\", \"horse\", \"cheetah\", \"deer\"]\n"
+         " },\n"
+         " {\n"
+         " \"[MASK]3\": [\"green\", \"wide\", \"sunny\", \"open\", \"empty\"]\n"
+         " }\n"
+         "]\n\n"
+         "Example input: \"It was a [MASK] day when the train arrived at the station.\"\n\n"
+         "Example output:\n"
+         "[\n"
+         " {\n"
+         " \"[MASK]1\": [\"sunny\", \"rainy\", \"cloudy\", \"foggy\", \"stormy\"]\n"
+         " },\n"
+         "]\n\n"
+         "Now, please process the following sentence:\n"
+         f"{sentence}"
+     )
+
+
+     response = openai.ChatCompletion.create(
+         model="gpt-3.5-turbo",
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant."},
+             {"role": "user", "content": prompt}
+         ],
+         max_tokens=100,
+         n=1,
+         stop=None,
+         temperature=0.7
+     )
+
+     print(response['choices'][0]['message']['content'])
+
+
+ # sentence = "Evacuations and storm [MASK] began on Sunday night as forecasters projected that Hurricane Dorian would hit into Florida’s west coast on Wednesday as a major hurricane packing life-threatening winds and storm surge."
+ # predict_masked_words(sentence, n_suggestions=5)
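
This file targets the pre-1.0 openai interface (openai.ChatCompletion.create), while paraphraser.py later in this commit imports the newer OpenAI client. If the project standardizes on openai>=1.0, an equivalent call might look like the following sketch (an assumption about the migration, not code from this commit; `prompt` stands for the prompt string built above):

# Sketch (assumes openai>=1.0): the same chat completion issued through the newer client.
import os
from openai import OpenAI

client = OpenAI(api_key=os.getenv("API_KEY"))
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},  # prompt built as in predict_masked_words above
    ],
    max_tokens=100,
    temperature=0.7,
)
print(response.choices[0].message.content)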
highlighter.py ADDED
@@ -0,0 +1,92 @@
+ import re
+
+ def highlight_common_words(common_words, sentences, title):
+     color_map = {}
+     highlighted_html = []
+
+     for idx, sentence in enumerate(sentences, start=1):
+         highlighted_sentence = f"{idx}. {sentence}"
+
+         for index, word in common_words:
+             if word not in color_map:
+                 # Assign color using HSL for better visual distinction
+                 color_map[word] = f'hsl({(len(color_map) % 6) * 60}, 70%, 80%)'
+
+             # Create a regex pattern for the word
+             escaped_word = re.escape(word)
+             pattern = rf'\b{escaped_word}\b'
+             color = color_map[word]
+
+             # Use a lambda function for word highlighting
+             highlighted_sentence = re.sub(
+                 pattern,
+                 lambda m: (f'<span style="background-color: {color}; font-weight: bold;'
+                            ' padding: 2px 4px; border-radius: 2px; position: relative;">'
+                            f'<span style="background-color: black; color: white; border-radius: 50%;'
+                            f' padding: 2px 5px; margin-right: 5px;">{index}</span>'
+                            f'{m.group(0)}'
+                            '</span>'),
+                 highlighted_sentence,
+                 flags=re.IGNORECASE
+             )
+
+         highlighted_html.append(highlighted_sentence)
+
+     # Construct the final HTML output
+     return generate_html(title, highlighted_html)
+
+ def highlight_common_words_dict(common_words, sentences, title):
+     color_map = {}
+     highlighted_html = []
+
+     for idx, (sentence, score) in enumerate(sentences.items(), start=1):
+         highlighted_sentence = f"{idx}. {sentence}"
+
+         for index, word in common_words:
+             if word not in color_map:
+                 color_map[word] = f'hsl({(len(color_map) % 6) * 60}, 70%, 80%)'
+             escaped_word = re.escape(word)
+             pattern = rf'\b{escaped_word}\b'
+             color = color_map[word]
+
+             highlighted_sentence = re.sub(
+                 pattern,
+                 lambda m: (f'<span style="background-color: {color}; font-weight: bold;'
+                            ' padding: 1px 2px; border-radius: 2px; position: relative;">'
+                            f'<span style="background-color: black; color: white; border-radius: 50%;'
+                            f' padding: 1px 3px; margin-right: 3px; font-size: 0.8em;">{index}</span>'
+                            f'{m.group(0)}'
+                            '</span>'),
+                 highlighted_sentence,
+                 flags=re.IGNORECASE
+             )
+
+         highlighted_html.append(
+             f'<div style="margin-bottom: 5px;">'
+             f'{highlighted_sentence}'
+             f'<div style="display: inline-block; margin-left: 5px; padding: 3px 5px; border-radius: 3px;'
+             f' background-color: white; font-size: 0.9em;">Entailment Score: {score}</div></div>'
+         )
+
+     return generate_html(title, highlighted_html)
+
+ def generate_html(title, highlighted_html):
+     final_html = "<br><br>".join(highlighted_html)
+     return f'''
+     <div style="border: solid 1px #ccc; padding: 16px; background-color: #FFFFFF; color: #374151;
+                 box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
+         <h3 style="margin-top: 0; font-size: 1em; color: #111827;">{title}</h3>
+         <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
+     </div>
+     '''
+
+ def reparaphrased_sentences_html(sentences):
+     formatted_sentences = [f"{idx + 1}. {sentence}" for idx, sentence in enumerate(sentences)]
+     final_html = "<br><br>".join(formatted_sentences)
+
+     return f'''
+     <div style="border: solid 1px #ccc; padding: 16px; background-color: #FFFFFF; color: #374151;
+                 box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
+         <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
+     </div>
+     '''
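
A minimal usage sketch for the highlighting helpers above (the common-word list mirrors the (index, phrase) pairs produced by lcs.find_common_subsequences; inputs are illustrative):

# Sketch: render one prompt with its common phrases highlighted as an HTML card.
from highlighter import highlight_common_words, reparaphrased_sentences_html

common = [(1, "quick brown fox"), (2, "lazy dog")]  # (index, phrase) pairs
html = highlight_common_words(
    common,
    ["The quick brown fox jumps over the lazy dog"],
    "Non-melting Points in the User Prompt",
)
print(html[:120])  # opening of the generated HTML card
print(reparaphrased_sentences_html(["Sentence one.", "Sentence two."])[:120])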
lcs.py ADDED
@@ -0,0 +1,63 @@
+ import re
+ from nltk.corpus import stopwords
+
+ def find_common_subsequences(sentence, str_list):
+     # Load stop words
+     stop_words = set(stopwords.words('english'))
+
+     # Preprocess the input sentence and list of strings
+     sentence = sentence.lower()
+     cleaned_str_list = [s.lower() for s in str_list]
+
+     def clean_text(text):
+         """Remove stop words and special characters from a given text."""
+         text = re.sub(r'[^\w\s]', '', text)
+         return " ".join(word for word in text.split() if word not in stop_words)
+
+     cleaned_sentence = clean_text(sentence)
+     cleaned_str_list = [clean_text(s) for s in cleaned_str_list]
+
+     words = cleaned_sentence.split()
+     common_grams = []
+     added_phrases = set()
+
+     for n in range(5, 0, -1):  # Check n-grams from size 5 to 1
+         for i in range(len(words) - n + 1):
+             subseq = " ".join(words[i:i + n])
+             if is_present(subseq, cleaned_str_list) and subseq not in added_phrases:
+                 common_grams.append((i, subseq))
+                 added_phrases.add(subseq)
+
+     # Sort by the first appearance in the original sentence and create indexed common grams
+     common_grams.sort(key=lambda x: x[0])
+     return [(index + 1, subseq) for index, (_, subseq) in enumerate(common_grams)]
+
+ def is_present(subseq, str_list):
+     """Check if a subsequence is present in all strings in the list."""
+     subseq_regex = re.compile(r'\b' + re.escape(subseq) + r'\b')
+     return all(subseq_regex.search(s) for s in str_list)
+
+ def find_common_gram_positions(str_list, common_grams):
+     """Find positions of common grams in each string from str_list."""
+     positions = []
+
+     for sentence in str_list:
+         words = re.sub(r'[^\w\s]', '', sentence).lower().split()
+         word_positions = {word: [] for word in words}
+
+         for idx, word in enumerate(words):
+             word_positions[word].append(idx + 1)  # Store 1-based index positions
+
+         sentence_positions = []
+         for _, gram in common_grams:
+             gram_words = re.sub(r'[^\w\s]', '', gram).lower().split()
+
+             if all(word in word_positions for word in gram_words):
+                 start_idx = word_positions[gram_words[0]][0]
+                 sentence_positions.append(start_idx)
+             else:
+                 sentence_positions.append(-1)  # Common gram not found
+
+         positions.append(sentence_positions)
+
+     return positions
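
A minimal usage sketch for the LCS helpers above (requires the NLTK stopwords corpus; sentences are illustrative). Note that find_common_gram_positions expects the (index, phrase) pairs returned by find_common_subsequences:

# Sketch: find the phrases shared by a prompt and all of its paraphrases.
import nltk
nltk.download('stopwords', quiet=True)

from lcs import find_common_subsequences, find_common_gram_positions

prompt = "The quick brown fox jumps over the lazy dog"
paraphrases = [
    "A quick brown fox leaps over a lazy dog",
    "The quick brown fox hops over the lazy dog",
]

grams = find_common_subsequences(prompt, paraphrases)       # (index, phrase) pairs such as (1, 'quick brown fox')
positions = find_common_gram_positions(paraphrases, grams)  # 1-based start position of each gram per paraphrase
print(grams, positions)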
masking_methods.py ADDED
@@ -0,0 +1,137 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
+ import random
+ from nltk.corpus import stopwords
+ import nltk
+ from vocabulary_split import split_vocabulary, filter_logits
+
+ # Load tokenizer and model for masked language model
+ tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+ model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+ fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
+
+ # Get permissible vocabulary
+ permissible, _ = split_vocabulary(seed=42)
+ permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])
+
+ # Initialize stop words and ensure NLTK resources are downloaded
+ stop_words = set(stopwords.words('english'))
+ nltk.download('averaged_perceptron_tagger', quiet=True)
+ nltk.download('maxent_ne_chunker', quiet=True)
+ nltk.download('words', quiet=True)
+
+ def get_logits_for_mask(sentence):
+     inputs = tokenizer(sentence, return_tensors="pt")
+     mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     logits = outputs.logits
+     return logits[0, mask_token_index, :].squeeze()
+
+ def mask_word(sentence, word):
+     masked_sentence = sentence.replace(word, '[MASK]', 1)
+     logits = get_logits_for_mask(masked_sentence)
+     filtered_logits = filter_logits(logits, permissible_indices)
+     words = [tokenizer.decode([i]) for i in filtered_logits.argsort()[-5:]]
+     return masked_sentence, filtered_logits.tolist(), words
+
+ def mask_non_stopword(sentence, pseudo_random=False):
+     non_stop_words = [word for word in sentence.split() if word.lower() not in stop_words]
+     if not non_stop_words:
+         return sentence, None, None
+
+     if pseudo_random:
+         random.seed(10)  # Fixed seed for pseudo-randomness
+     word_to_mask = random.choice(non_stop_words)
+     return mask_word(sentence, word_to_mask)
+
+ def mask_between_lcs(sentence, lcs_points):
+     words = sentence.split()
+     masked_indices = []
+
+     # Mask first word before the first LCS point
+     if lcs_points and lcs_points[0] > 0:
+         idx = random.randint(0, lcs_points[0] - 1)
+         words[idx] = '[MASK]'
+         masked_indices.append(idx)
+
+     # Mask between LCS points
+     for i in range(len(lcs_points) - 1):
+         start, end = lcs_points[i], lcs_points[i + 1]
+         if end - start > 1:
+             mask_index = random.randint(start + 1, end - 1)
+             words[mask_index] = '[MASK]'
+             masked_indices.append(mask_index)
+
+     # Mask last word after the last LCS point
+     if lcs_points and lcs_points[-1] < len(words) - 1:
+         idx = random.randint(lcs_points[-1] + 1, len(words) - 1)
+         words[idx] = '[MASK]'
+         masked_indices.append(idx)
+
+     masked_sentence = ' '.join(words)
+     logits = get_logits_for_mask(masked_sentence)
+
+     logits_list, top_words_list = [], []
+     # logits has one row per [MASK] token, so index by mask order rather than word position
+     for i, idx in enumerate(masked_indices):
+         filtered_logits = filter_logits(logits[i], permissible_indices)
+         logits_list.append(filtered_logits.tolist())
+         top_words = [tokenizer.decode([i]) for i in filtered_logits.topk(5).indices.tolist()]
+         top_words_list.append(top_words)
+
+     return masked_sentence, logits_list, top_words_list
+
+ def high_entropy_words(sentence, non_melting_points):
+     non_melting_words = {word.lower() for _, point in non_melting_points for word in point.split()}
+     candidate_words = [word for word in sentence.split() if word.lower() not in stop_words and word.lower() not in non_melting_words]
+
+     if not candidate_words:
+         return sentence, None, None
+
+     max_entropy, max_entropy_word, max_logits = -float('inf'), None, None
+     for word in candidate_words:
+         masked_sentence = sentence.replace(word, '[MASK]', 1)
+         logits = get_logits_for_mask(masked_sentence)
+         filtered_logits = filter_logits(logits, permissible_indices)
+
+         # Calculate entropy
+         probs = torch.softmax(filtered_logits, dim=-1)
+         top_5_probs = probs.topk(5).values
+         entropy = -torch.sum(top_5_probs * torch.log(top_5_probs + 1e-10))  # Avoid log(0)
+
+         if entropy > max_entropy:
+             max_entropy, max_entropy_word, max_logits = entropy, word, filtered_logits
+
+     if max_entropy_word is None:
+         return sentence, None, None
+
+     masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
+     words = [tokenizer.decode([i]) for i in max_logits.argsort()[-5:]]
+     return masked_sentence, max_logits.tolist(), words
+
+ def mask_by_pos(sentence, pos_to_mask=['NOUN', 'VERB', 'ADJ']):
+     words = nltk.word_tokenize(sentence)
+     pos_tags = nltk.pos_tag(words)
+
+     maskable_words = [word for word, pos in pos_tags if pos[:2] in pos_to_mask]
+     if not maskable_words:
+         return sentence, None, None
+
+     word_to_mask = random.choice(maskable_words)
+     return mask_word(sentence, word_to_mask)
+
+ def mask_named_entity(sentence):
+     words = nltk.word_tokenize(sentence)
+     pos_tags = nltk.pos_tag(words)
+     named_entities = nltk.ne_chunk(pos_tags)
+
+     maskable_words = [word for word, tag in named_entities.leaves() if isinstance(tag, nltk.Tree)]
+     if not maskable_words:
+         return sentence, None, None
+
+     word_to_mask = random.choice(maskable_words)
+     return mask_word(sentence, word_to_mask)
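
A minimal usage sketch for the masking helpers above (the BERT checkpoint is downloaded on first import; vocabulary_split.py from this repo must be importable; the sentence and non-melting points are illustrative):

# Sketch: mask one word per strategy and inspect the candidate replacements.
from masking_methods import mask_non_stopword, high_entropy_words

sentence = "The quick brown fox jumps over the lazy dog"

masked, logits, candidates = mask_non_stopword(sentence)
print(masked)      # e.g. "The quick brown [MASK] jumps over the lazy dog"
print(candidates)  # five permissible candidate tokens for the mask

# high_entropy_words leaves the listed non-melting points untouched
non_melting = [(1, "quick brown fox"), (2, "lazy dog")]
masked2, logits2, candidates2 = high_entropy_words(sentence, non_melting)
print(masked2)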
masking_methods_trial.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
3
+ from transformers import pipeline
4
+ import random
5
+ from nltk.corpus import stopwords
6
+ import nltk
7
+ nltk.download('stopwords')
8
+ import math
9
+ from vocabulary_split import split_vocabulary, filter_logits
10
+ import abc
11
+ from typing import List
12
+
13
+ # Load tokenizer and model for masked language model
14
+ tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
15
+ model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
16
+ fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
17
+
18
+ # Get permissible vocabulary
19
+ permissible, _ = split_vocabulary(seed=42)
20
+ permissible_indices = torch.tensor([i in permissible.values() for i in range(len(tokenizer))])
21
+
22
+ def get_logits_for_mask(model, tokenizer, sentence):
23
+ inputs = tokenizer(sentence, return_tensors="pt")
24
+ mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
25
+
26
+ with torch.no_grad():
27
+ outputs = model(**inputs)
28
+
29
+ logits = outputs.logits
30
+ mask_token_logits = logits[0, mask_token_index, :]
31
+ return mask_token_logits.squeeze()
32
+
33
+ # Abstract Masking Strategy
34
+ class MaskingStrategy(abc.ABC):
35
+ @abc.abstractmethod
36
+ def select_words_to_mask(self, words: List[str], **kwargs) -> List[int]:
37
+ """
38
+ Given a list of words, return the indices of words to mask.
39
+ """
40
+ pass
41
+
42
+ # Specific Masking Strategies
43
+ class RandomNonStopwordMasking(MaskingStrategy):
44
+ def __init__(self, num_masks: int = 1):
45
+ self.num_masks = num_masks
46
+ self.stop_words = set(stopwords.words('english'))
47
+
48
+ def select_words_to_mask(self, words: List[str], **kwargs) -> List[int]:
49
+ non_stop_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
50
+ if not non_stop_indices:
51
+ return []
52
+ num_masks = min(self.num_masks, len(non_stop_indices))
53
+ return random.sample(non_stop_indices, num_masks)
54
+
55
+ class HighEntropyMasking(MaskingStrategy):
56
+ def __init__(self, num_masks: int = 1):
57
+ self.num_masks = num_masks
58
+
59
+ def select_words_to_mask(self, words: List[str], sentence: str, model, tokenizer, permissible_indices) -> List[int]:
60
+ candidate_indices = [i for i, word in enumerate(words) if word.lower() not in set(stopwords.words('english'))]
61
+ if not candidate_indices:
62
+ return []
63
+
64
+ entropy_scores = {}
65
+ for idx in candidate_indices:
66
+ masked_sentence = ' '.join(words[:idx] + ['[MASK]'] + words[idx+1:])
67
+ logits = get_logits_for_mask(model, tokenizer, masked_sentence)
68
+ filtered_logits = filter_logits(logits, permissible_indices)
69
+ probs = torch.softmax(filtered_logits, dim=-1)
70
+ top_5_probs = probs.topk(5).values
71
+ entropy = -torch.sum(top_5_probs * torch.log(top_5_probs + 1e-10)).item()
72
+ entropy_scores[idx] = entropy
73
+
74
+ # Select top N indices with highest entropy
75
+ sorted_indices = sorted(entropy_scores, key=entropy_scores.get, reverse=True)
76
+ return sorted_indices[:self.num_masks]
77
+
78
+ class PseudoRandomNonStopwordMasking(MaskingStrategy):
79
+ def __init__(self, num_masks: int = 1, seed: int = 10):
80
+ self.num_masks = num_masks
81
+ self.seed = seed
82
+ self.stop_words = set(stopwords.words('english'))
83
+
84
+ def select_words_to_mask(self, words: List[str], **kwargs) -> List[int]:
85
+ non_stop_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
86
+ if not non_stop_indices:
87
+ return []
88
+ random.seed(self.seed)
89
+ num_masks = min(self.num_masks, len(non_stop_indices))
90
+ return random.sample(non_stop_indices, num_masks)
91
+
92
+ class CompositeMaskingStrategy(MaskingStrategy):
93
+ def __init__(self, strategies: List[MaskingStrategy]):
94
+ self.strategies = strategies
95
+
96
+ def select_words_to_mask(self, words: List[str], **kwargs) -> List[int]:
97
+ selected_indices = []
98
+ for strategy in self.strategies:
99
+ if isinstance(strategy, HighEntropyMasking):
100
+ selected = strategy.select_words_to_mask(words, **kwargs)
101
+ else:
102
+ selected = strategy.select_words_to_mask(words)
103
+ selected_indices.extend(selected)
104
+ return list(set(selected_indices)) # Remove duplicates
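+
+ # Illustrative example (not from the original code): with words = ["the", "quick",
+ # "brown", "fox"], a composite of a random and a high-entropy strategy might return
+ # [1, 3] after de-duplication.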
105
+
106
+ # Refactored mask_between_lcs function
107
+ def mask_between_lcs(sentence, lcs_points, masking_strategy: MaskingStrategy, model, tokenizer, permissible_indices):
108
+ words = sentence.split()
109
+ masked_indices = []
110
+
111
+ segments = []
112
+
113
+ # Define segments based on LCS points
114
+ previous = 0
115
+ for point in lcs_points:
116
+ if point > previous:
117
+ segments.append((previous, point))
118
+ previous = point + 1
119
+ if previous < len(words):
120
+ segments.append((previous, len(words)))
121
+
122
+ # Collect all indices to mask from each segment
123
+ for start, end in segments:
124
+ segment_words = words[start:end]
125
+ if isinstance(masking_strategy, HighEntropyMasking):
126
+ selected = masking_strategy.select_words_to_mask(segment_words, sentence, model, tokenizer, permissible_indices)
127
+ else:
128
+ selected = masking_strategy.select_words_to_mask(segment_words)
129
+
130
+ # Adjust indices relative to the whole sentence
131
+ for idx in selected:
132
+ masked_idx = start + idx
133
+ if masked_idx not in masked_indices:
134
+ masked_indices.append(masked_idx)
135
+
136
+ # Apply masking
137
+ for idx in masked_indices:
138
+ words[idx] = '[MASK]'
139
+
140
+ masked_sentence = ' '.join(words)
141
+ logits = get_logits_for_mask(model, tokenizer, masked_sentence)
142
+
143
+ # Process each masked token
144
+ top_words_list = []
145
+ logits_list = []
146
+ for i, idx in enumerate(masked_indices):
147
+ logits_i = logits[i] if logits.dim() > 1 else logits  # with a single mask, logits is already the (vocab,) row
148
+ if logits_i.dim() > 1:
149
+ logits_i = logits_i.squeeze()
150
+ filtered_logits_i = filter_logits(logits_i, permissible_indices)
151
+ logits_list.append(filtered_logits_i.tolist())
152
+ top_5_indices = filtered_logits_i.topk(5).indices.tolist()
153
+ top_words = [tokenizer.decode([i]) for i in top_5_indices]
154
+ top_words_list.append(top_words)
155
+
156
+ return masked_sentence, logits_list, top_words_list
157
+
158
+ # Example Usage
159
+ if __name__ == "__main__":
160
+ # Example sentence and LCS points
161
+ sentence = "This is a sample sentence with some LCS points"
162
+ lcs_points = [2, 5, 8] # Indices of LCS points
163
+
164
+ # Initialize masking strategies
165
+ random_non_stopword_strategy = RandomNonStopwordMasking(num_masks=1)
166
+ high_entropy_strategy = HighEntropyMasking(num_masks=1)
167
+ pseudo_random_strategy = PseudoRandomNonStopwordMasking(num_masks=1, seed=10)
168
+ composite_strategy = CompositeMaskingStrategy([
169
+ RandomNonStopwordMasking(num_masks=1),
170
+ HighEntropyMasking(num_masks=1)
171
+ ])
172
+
173
+ # Choose a strategy
174
+ chosen_strategy = composite_strategy # You can choose any initialized strategy
175
+
176
+ # Apply masking
177
+ masked_sentence, logits_list, top_words_list = mask_between_lcs(
178
+ sentence,
179
+ lcs_points,
180
+ masking_strategy=chosen_strategy,
181
+ model=model,
182
+ tokenizer=tokenizer,
183
+ permissible_indices=permissible_indices
184
+ )
185
+
186
+ print("Masked Sentence:", masked_sentence)
187
+ for idx, top_words in enumerate(top_words_list):
188
+ print(f"Top words for mask {idx+1}:", top_words)
paraphraser.py ADDED
@@ -0,0 +1,45 @@
1
+ from openai import OpenAI
2
+ from dotenv import load_dotenv
3
+ import os
4
+
5
+ # Load environment variables
6
+ load_dotenv()
7
+ key = os.getenv("OPENAI_API_KEY")
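+ # Assumes a .env file next to this module with a line such as OPENAI_API_KEY=<your key>;
+ # if it is missing, `key` is None and the client below will fail to authenticate.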
8
+
9
+ # Initialize the OpenAI client
10
+ client = OpenAI(api_key=key)
11
+
12
+ def generate_paraphrase(sentences, model="gpt-4", num_paraphrases=10, max_tokens=150, temperature=0.7):
13
+ """Generate paraphrased sentences using the OpenAI GPT-4 model."""
14
+
15
+ # Ensure sentences is a list
16
+ if isinstance(sentences, str):
17
+ sentences = [sentences]
18
+
19
+ paraphrased_sentences_list = []
20
+
21
+ for sentence in sentences:
22
+ full_prompt = f"Paraphrase the following text: '{sentence}'"
23
+
24
+ try:
25
+ chat_completion = client.chat.completions.create(
26
+ messages=[{"role": "user", "content": full_prompt}],
27
+ model=model,
28
+ max_tokens=max_tokens,
29
+ temperature=temperature,
30
+ n=num_paraphrases # Number of paraphrased sentences to generate
31
+ )
32
+ # Extract paraphrased sentences
33
+ paraphrased_sentences = [choice.message.content.strip() for choice in chat_completion.choices]
34
+ paraphrased_sentences_list.extend(paraphrased_sentences)
35
+ except Exception as e:
36
+ print(f"Error paraphrasing sentence '{sentence}': {e}")
37
+
38
+ return paraphrased_sentences_list
39
+
40
+ # Example usage (guarded so that importing this module does not trigger an API call)
+ if __name__ == "__main__":
+     result = generate_paraphrase(
+         "Mayor Eric Adams did not attend the first candidate forum for the New York City mayoral race, but his record — and the criminal charges he faces — received plenty of attention on Saturday from the Democrats who are running to unseat him."
+     )
+     print(f"Number of paraphrases generated: {len(result)}")
requirements.txt ADDED
@@ -0,0 +1,21 @@
1
+ ipywidgets
2
+ transformers
3
+ plotly
4
+ requests
5
+ Pillow
6
+ numpy
7
+ matplotlib
8
+ tqdm
9
+ scipy
10
+ torch
11
+ seaborn
12
+ termcolor
13
+ nltk
14
+ tenacity
15
+ pandas
16
+ graphviz==0.20.3
17
+ gradio==4.29.0
18
+ openai
19
+ python-dotenv
20
+ scikit-learn
21
+ sentence-transformers
sampling_methods.py ADDED
@@ -0,0 +1,35 @@
1
+ import torch
2
+ import random
3
+ from vocabulary_split import split_vocabulary, filter_logits
4
+ from masking_methods import tokenizer
5
+
6
+ # Get permissible vocabulary
7
+ permissible, _ = split_vocabulary(seed=42)
8
+ permissible_ids = set(permissible.values())  # set membership keeps this O(vocab) rather than O(vocab^2)
+ permissible_indices = torch.tensor([i in permissible_ids for i in range(len(tokenizer))], dtype=torch.bool)
9
+
10
+ def sample_word(sentence, words, logits, sampling_technique='inverse_transform', temperature=1.0):
11
+ # Convert logits to a tensor and filter based on permissible indices
12
+ filtered_logits = filter_logits(torch.tensor(logits), permissible_indices)
13
+ probs = torch.softmax(filtered_logits / temperature, dim=-1)
14
+
15
+ # Select sampling technique
16
+ if sampling_technique == 'inverse_transform':
17
+ cumulative_probs = torch.cumsum(probs, dim=-1)
18
+ random_prob = random.random()
19
+ sampled_index = min(torch.searchsorted(cumulative_probs, random_prob).item(), probs.numel() - 1)
20
+ elif sampling_technique == 'exponential_minimum':
21
+ exp_noise = -torch.log(torch.rand_like(probs))  # Exp(1) noise per token
22
+ sampled_index = torch.argmin(exp_noise / probs).item()  # exponential-minimum trick: argmin of Exp(1)/p samples from p
23
+ elif sampling_technique == 'temperature':
24
+ sampled_index = torch.multinomial(probs, 1).item()
25
+ elif sampling_technique == 'greedy':
26
+ sampled_index = torch.argmax(filtered_logits).item()
27
+ else:
28
+ raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")
29
+
30
+ sampled_word = tokenizer.decode([sampled_index])
31
+
32
+ # Replace [MASK] with the sampled word
33
+ filled_sentence = sentence.replace('[MASK]', sampled_word)
34
+
35
+ return filled_sentence
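+
+ # Illustrative usage (sketch): the logits would normally come from masking_methods;
+ # here a placeholder all-zero vector is used purely to show the call signature.
+ if __name__ == "__main__":
+     demo_sentence = "The [MASK] is bright today."
+     demo_logits = [0.0] * len(tokenizer)  # placeholder, one logit per vocabulary entry
+     print(sample_word(demo_sentence, demo_sentence.split(), demo_logits, sampling_technique='greedy'))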
scores.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+ import numpy as np
3
+ from nltk.translate.bleu_score import sentence_bleu
4
+ from transformers import BertTokenizer, BertModel
5
+
6
+ # Function to Calculate the BLEU score
7
+ def calculate_bleu(reference, candidate):
8
+ return sentence_bleu([reference.split()], candidate.split())  # tokenize so BLEU is computed over words, not characters
9
+
10
+ # Function to calculate BERT score
11
+ def calculate_bert(reference, candidate):
12
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
13
+ model = BertModel.from_pretrained('bert-base-uncased')
14
+
15
+ reference_tokens = tokenizer.tokenize(reference)
16
+ candidate_tokens = tokenizer.tokenize(candidate)
17
+
18
+ reference_ids = tokenizer.encode(reference, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt")
19
+ candidate_ids = tokenizer.encode(candidate, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt")
20
+
21
+ with torch.no_grad():
22
+ reference_outputs = model(reference_ids)
23
+ candidate_outputs = model(candidate_ids)
24
+
25
+ reference_embeddings = reference_outputs[0][:, 0, :].numpy()
26
+ candidate_embeddings = candidate_outputs[0][:, 0, :].numpy()
27
+
28
+ cosine_similarity = np.dot(reference_embeddings, candidate_embeddings.T) / (np.linalg.norm(reference_embeddings) * np.linalg.norm(candidate_embeddings))
29
+ return np.mean(cosine_similarity)
30
+
31
+ # Function to calculate minimum edit distance
32
+ def min_edit_distance(reference, candidate):
33
+ m = len(reference)
34
+ n = len(candidate)
35
+
36
+ dp = [[0] * (n + 1) for _ in range(m + 1)]
37
+
38
+ for i in range(m + 1):
39
+ for j in range(n + 1):
40
+ if i == 0:
41
+ dp[i][j] = j
42
+ elif j == 0:
43
+ dp[i][j] = i
44
+ elif reference[i - 1] == candidate[j - 1]:
45
+ dp[i][j] = dp[i - 1][j - 1]
46
+ else:
47
+ dp[i][j] = 1 + min(dp[i][j - 1], # Insert
48
+ dp[i - 1][j], # Remove
49
+ dp[i - 1][j - 1]) # Replace
50
+
51
+ return dp[m][n]
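+
+ # Illustrative usage (sketch): compares an original sentence with a paraphrase.
+ if __name__ == "__main__":
+     ref = "The quick brown fox jumps over the lazy dog"
+     cand = "A quick brown fox leaps over a lazy dog"
+     print("BLEU:", calculate_bleu(ref, cand))
+     print("BERT cosine similarity:", calculate_bert(ref, cand))
+     print("Minimum edit distance:", min_edit_distance(ref, cand))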
threeD_plot.py ADDED
@@ -0,0 +1,69 @@
1
+ import numpy as np
2
+ import plotly.graph_objects as go
3
+ from scipy.interpolate import griddata
4
+
5
+ def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
6
+ # Convert input lists to NumPy arrays
7
+ detectability = np.array(detectability_val)
8
+ distortion = np.array(distortion_val)
9
+ euclidean = np.array(euclidean_val)
10
+
11
+ # Normalize the values to range [0, 1]
12
+ def normalize(data):
13
+ min_val, max_val = np.min(data), np.max(data)
14
+ return (data - min_val) / (max_val - min_val) if max_val > min_val else np.zeros_like(data)
15
+
16
+ norm_detectability = normalize(detectability)
17
+ norm_distortion = normalize(distortion)
18
+ norm_euclidean = normalize(euclidean)
19
+
20
+ # Composite score: maximize detectability, minimize distortion and Euclidean distance
21
+ composite_score = norm_detectability - (norm_distortion + norm_euclidean)
22
+
23
+ # Sweet spot values
24
+ sweet_spot_index = np.argmax(composite_score)
25
+ sweet_spot = (detectability[sweet_spot_index], distortion[sweet_spot_index], euclidean[sweet_spot_index])
26
+
27
+ # Create a meshgrid for interpolation
28
+ x_grid, y_grid = np.meshgrid(
29
+ np.linspace(np.min(detectability), np.max(detectability), 30),
30
+ np.linspace(np.min(distortion), np.max(distortion), 30)
31
+ )
32
+
33
+ # Interpolate z values (Euclidean distances) to fit the grid
34
+ z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='linear')
35
+
36
+ if z_grid is None or np.isnan(z_grid).all():  # griddata returns an array (possibly all-NaN), not None
37
+ raise ValueError("griddata could not generate a valid interpolation. Check your input data.")
38
+
39
+ # Create the 3D contour plot with the Plasma color scale
40
+ fig = go.Figure(data=go.Surface(
41
+ z=z_grid,
42
+ x=x_grid,
43
+ y=y_grid,
44
+ contours={"z": {"show": True, "start": np.min(euclidean), "end": np.max(euclidean), "size": 0.1, "usecolormap": True}},
45
+ colorscale='Plasma'
46
+ ))
47
+
48
+ # Add a marker for the sweet spot
49
+ fig.add_trace(go.Scatter3d(
50
+ x=[sweet_spot[0]],
51
+ y=[sweet_spot[1]],
52
+ z=[sweet_spot[2]],
53
+ mode='markers+text',
54
+ marker=dict(size=10, color='red', symbol='circle'),
55
+ text=["Sweet Spot"],
56
+ textposition="top center"
57
+ ))
58
+
59
+ # Set axis labels
60
+ fig.update_layout(
61
+ scene=dict(
62
+ xaxis_title='Detectability Score',
63
+ yaxis_title='Distortion Score',
64
+ zaxis_title='Euclidean Distance'
65
+ ),
66
+ margin=dict(l=0, r=0, b=0, t=0)
67
+ )
68
+
69
+ return fig
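+
+ # Illustrative usage (sketch with made-up scores, not real pipeline output):
+ if __name__ == "__main__":
+     demo_fig = gen_three_D_plot(
+         detectability_val=[0.2, 0.5, 0.8, 0.9],
+         distortion_val=[0.1, 0.3, 0.6, 0.9],
+         euclidean_val=[0.4, 0.2, 0.5, 0.7],
+     )
+     demo_fig.show()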
tree.py ADDED
@@ -0,0 +1,240 @@
1
+ import plotly.graph_objects as go
2
+ import textwrap
3
+ import re
4
+ from collections import defaultdict
5
+
6
+ def apply_lcs_numbering(sentence, common_grams):
7
+ """Apply LCS numbering based on common grams."""
8
+ for idx, lcs in common_grams:
9
+ sentence = re.sub(rf"\b{re.escape(lcs)}\b", f"({idx}){lcs}", sentence)
10
+ return sentence
11
+
12
+ def highlight_words(sentence, color_map):
13
+ """Highlight specified words in a sentence with corresponding colors."""
14
+ for word, color in color_map.items():
15
+ sentence = re.sub(rf"\b{re.escape(word)}\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
16
+ return sentence
17
+
18
+ def clean_and_wrap_nodes(nodes, highlight_info):
19
+ """Clean nodes by removing labels and wrap text for display."""
20
+ global_color_map = dict(highlight_info)
21
+ cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
22
+ highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
23
+ return ['<br>'.join(textwrap.wrap(node, width=55)) for node in highlighted_nodes]
24
+
25
+ def get_levels_and_edges(nodes):
26
+ """Determine levels and create edges dynamically."""
27
+ levels = {}
28
+ edges = []
29
+ for i, node in enumerate(nodes):
30
+ level = int(node.split()[-1][1])
31
+ levels[i] = level
32
+
33
+ # Create edges from level 0 to level 1 nodes
34
+ root_node = next(i for i, level in levels.items() if level == 0)
35
+ edges.extend((root_node, i) for i, level in levels.items() if level == 1)
36
+
37
+ return levels, edges
38
+
39
+ def calculate_positions(levels):
40
+ """Calculate x, y positions for each node based on levels."""
41
+ positions = {}
42
+ level_heights = defaultdict(int)
43
+ y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
44
+
45
+ for node, level in levels.items():
46
+ level_heights[level] += 1
47
+ x_gap = 2
48
+ l1_y_gap = 10
49
+ positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
50
+ y_offsets[level] += 1
51
+
52
+ return positions
53
+
54
+ def color_highlighted_words(node, color_map):
55
+ """Highlight words in a wrapped node string."""
56
+ parts = re.split(r'(\{\{.*?\}\})', node)
57
+ colored_parts = [
58
+ f"<span style='color: {color_map.get(match.group(1), 'black')};'>{match.group(1)}</span>"
59
+ if (match := re.match(r'\{\{(.*?)\}\}', part))
60
+ else part
61
+ for part in parts
62
+ ]
63
+ return ''.join(colored_parts)
64
+
65
+ def generate_subplot(paraphrased_sentence, scheme_sentences, highlight_info, common_grams, subplot_number):
66
+ """Generate a subplot based on the input sentences and highlight info."""
67
+ # Combine nodes into one list with appropriate labels
68
+ nodes = [paraphrased_sentence + ' L0'] + [s + ' L1' for s in scheme_sentences]
69
+
70
+ # Apply LCS numbering and clean/wrap nodes
71
+ nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]
72
+ wrapped_nodes = clean_and_wrap_nodes(nodes, highlight_info)
73
+
74
+ # Get levels and edges
75
+ levels, edges = get_levels_and_edges(nodes)
76
+ positions = calculate_positions(levels)
77
+
78
+ # Create figure
79
+ fig = go.Figure()
80
+
81
+ # Add nodes and edges to the figure
82
+ for i, node in enumerate(wrapped_nodes):
83
+ colored_node = color_highlighted_words(node, dict(highlight_info))
84
+ x, y = positions[i]
85
+
86
+ fig.add_trace(go.Scatter(
87
+ x=[-x], # Reflect the x coordinate
88
+ y=[y],
89
+ mode='markers',
90
+ marker=dict(size=10, color='blue'),
91
+ hoverinfo='none'
92
+ ))
93
+ fig.add_annotation(
94
+ x=-x, # Reflect the x coordinate
95
+ y=y,
96
+ text=colored_node,
97
+ showarrow=False,
98
+ xshift=15,
99
+ align="center",
100
+ font=dict(size=12),
101
+ bordercolor='black',
102
+ borderwidth=1,
103
+ borderpad=2,
104
+ bgcolor='white',
105
+ width=300,
106
+ height=120
107
+ )
108
+
109
+ # Add edges and edge annotations
110
+ edge_texts = [
111
+ "Highest Entropy Masking", "Pseudo-random Masking", "Random Masking",
112
+ "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
113
+ "Inverse Transform Sampling", "Greedy Sampling", "Temperature Sampling",
114
+ "Exponential Minimum Sampling", "Inverse Transform Sampling",
115
+ "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
116
+ "Inverse Transform Sampling"
117
+ ]
118
+
119
+ for i, edge in enumerate(edges):
120
+ x0, y0 = positions[edge[0]]
121
+ x1, y1 = positions[edge[1]]
122
+ fig.add_trace(go.Scatter(
123
+ x=[-x0, -x1], # Reflect the x coordinates
124
+ y=[y0, y1],
125
+ mode='lines',
126
+ line=dict(color='black', width=1)
127
+ ))
128
+
129
+ # Add text annotation above the edge
130
+ mid_x = (-x0 + -x1) / 2
131
+ mid_y = (y0 + y1) / 2
132
+ fig.add_annotation(
133
+ x=mid_x,
134
+ y=mid_y + 0.8, # Adjust y position to shift text upwards
135
+ text=edge_texts[i], # Use the text specific to this edge
136
+ showarrow=False,
137
+ font=dict(size=12),
138
+ align="center"
139
+ )
140
+
141
+ fig.update_layout(
142
+ showlegend=False,
143
+ margin=dict(t=20, b=20, l=20, r=20),
144
+ xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
145
+ yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
146
+ width=1435,
147
+ height=1000
148
+ )
149
+
150
+ return fig
151
+
152
+ def generate_subplot1(paraphrased_sentence, scheme_sentences, highlight_info, common_grams):
153
+ return generate_subplot(paraphrased_sentence, scheme_sentences, highlight_info, common_grams, subplot_number=1)
154
+
155
+ def generate_subplot2(scheme_sentences, sampled_sentence, highlight_info, common_grams):
156
+ nodes = scheme_sentences + [s + ' L1' for s in sampled_sentence]
157
+ for i in range(len(scheme_sentences)):
158
+ nodes[i] += ' L0' # Reassign levels
159
+
160
+ # Apply LCS numbering and clean/wrap nodes
161
+ nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]
162
+ wrapped_nodes = clean_and_wrap_nodes(nodes, highlight_info)
163
+
164
+ # Get levels and edges
165
+ levels, edges = get_levels_and_edges(nodes)
166
+ positions = calculate_positions(levels)
167
+
168
+ # Create figure
169
+ fig2 = go.Figure()
170
+
171
+ # Add nodes and edges to the figure
172
+ for i, node in enumerate(wrapped_nodes):
173
+ colored_node = color_highlighted_words(node, dict(highlight_info))
174
+ x, y = positions[i]
175
+
176
+ fig2.add_trace(go.Scatter(
177
+ x=[-x], # Reflect the x coordinate
178
+ y=[y],
179
+ mode='markers',
180
+ marker=dict(size=10, color='blue'),
181
+ hoverinfo='none'
182
+ ))
183
+ fig2.add_annotation(
184
+ x=-x, # Reflect the x coordinate
185
+ y=y,
186
+ text=colored_node,
187
+ showarrow=False,
188
+ xshift=15,
189
+ align="center",
190
+ font=dict(size=12),
191
+ bordercolor='black',
192
+ borderwidth=1,
193
+ borderpad=2,
194
+ bgcolor='white',
195
+ width=450,
196
+ height=65
197
+ )
198
+
199
+ # Add edges and text above each edge
200
+ edge_texts = [
201
+ "Highest Entropy Masking", "Pseudo-random Masking", "Random Masking",
202
+ "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
203
+ "Inverse Transform Sampling", "Greedy Sampling", "Temperature Sampling",
204
+ "Exponential Minimum Sampling", "Inverse Transform Sampling",
205
+ "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
206
+ "Inverse Transform Sampling"
207
+ ]
208
+
209
+ for i, edge in enumerate(edges):
210
+ x0, y0 = positions[edge[0]]
211
+ x1, y1 = positions[edge[1]]
212
+ fig2.add_trace(go.Scatter(
213
+ x=[-x0, -x1], # Reflect the x coordinates
214
+ y=[y0, y1],
215
+ mode='lines',
216
+ line=dict(color='black', width=1)
217
+ ))
218
+
219
+ # Add text annotation above the edge
220
+ mid_x = (-x0 + -x1) / 2
221
+ mid_y = (y0 + y1) / 2
222
+ fig2.add_annotation(
223
+ x=mid_x,
224
+ y=mid_y + 0.8, # Adjust y position to shift text upwards
225
+ text=edge_texts[i], # Use the text specific to this edge
226
+ showarrow=False,
227
+ font=dict(size=12),
228
+ align="center"
229
+ )
230
+
231
+ fig2.update_layout(
232
+ showlegend=False,
233
+ margin=dict(t=20, b=20, l=20, r=20),
234
+ xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
235
+ yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
236
+ width=1435,
237
+ height=1000
238
+ )
239
+
240
+ return fig2
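+
+ # Illustrative usage (sketch with toy inputs; the real inputs come from app.py):
+ if __name__ == "__main__":
+     paraphrased = "The cat sat on the mat"
+     schemes = ["A cat rested on the mat", "The cat was sitting on a mat"]
+     highlight = [("cat", "red"), ("mat", "blue")]
+     grams = [(1, "cat"), (2, "mat")]
+     generate_subplot1(paraphrased, schemes, highlight, grams).show()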
vocabulary_split.py ADDED
@@ -0,0 +1,56 @@
1
+ import random
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
4
+
5
+ # Load tokenizer and model once
6
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
7
+ model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
8
+
9
+ def split_vocabulary(seed=42):
10
+ """Split the vocabulary into permissible and non-permissible buckets."""
11
+ # Get the full vocabulary
12
+ vocab = list(tokenizer.get_vocab().items())
13
+
14
+ # Initialize the random number generator
15
+ random.seed(seed)
16
+
17
+ # Split the vocabulary
18
+ permissible = {}
19
+ non_permissible = {}
20
+
21
+ for word, index in vocab:
22
+ target_dict = permissible if random.random() < 0.5 else non_permissible
23
+ target_dict[word] = index
24
+
25
+ return permissible, non_permissible
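+
+ # Note: with a fixed seed the split is deterministic, and each of BERT's wordpieces
+ # lands in the permissible bucket with probability 0.5 (roughly half the vocabulary).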
26
+
27
+ def get_logits_for_mask(sentence):
28
+ """Get the logits for the masked token in the sentence."""
29
+ inputs = tokenizer(sentence, return_tensors="pt")
30
+ mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
31
+
32
+ with torch.no_grad():
33
+ logits = model(**inputs).logits[0, mask_token_index, :]
34
+
35
+ return logits.squeeze()
36
+
37
+ def filter_logits(logits, permissible_indices):
38
+ """Filter logits based on permissible indices."""
39
+ filtered_logits = logits.clone()
40
+
41
+ # Set logits to -inf for non-permissible indices
42
+ filtered_logits[~permissible_indices] = float('-inf')
43
+
44
+ return filtered_logits
45
+
46
+ # Usage example (guarded so that importing this module does not run a forward pass)
+ if __name__ == "__main__":
+     permissible, _ = split_vocabulary(seed=42)
+
+     # Create the permissible-indices mask over the full vocabulary
+     permissible_ids = set(permissible.values())
+     permissible_indices = torch.tensor([i in permissible_ids for i in range(len(tokenizer))], dtype=torch.bool)
+
+     # When sampling:
+     sentence = "The [MASK] is bright today."
+     logits = get_logits_for_mask(sentence)
+     filtered_logits = filter_logits(logits, permissible_indices)
56
+
watermark_detector.py ADDED
@@ -0,0 +1,75 @@
1
+ import nltk
2
+ from nltk.corpus import stopwords
3
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
4
+ from vocabulary_split import split_vocabulary, filter_logits
5
+ import torch
6
+ from lcs import find_common_subsequences
7
+ from paraphraser import generate_paraphrase
8
+
9
+ nltk.download('punkt', quiet=True)
10
+ nltk.download('stopwords', quiet=True)
11
+
12
+ tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
13
+ model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
14
+
15
+ permissible, _ = split_vocabulary(seed=42)
16
+ permissible_ids = set(permissible.values())  # set membership keeps this O(vocab) rather than O(vocab^2)
+ permissible_indices = torch.tensor([i in permissible_ids for i in range(len(tokenizer))], dtype=torch.bool)
17
+
18
+ def get_non_melting_points(original_sentence):
19
+ paraphrased_sentences = generate_paraphrase(original_sentence)
20
+ common_subsequences = find_common_subsequences(original_sentence, paraphrased_sentences)
21
+ return common_subsequences
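+
+ # Illustrative: for a sentence like "The quick brown fox jumps over the lazy dog." this
+ # is expected to return indexed n-grams shared by every paraphrase, e.g.
+ # [(1, "quick brown fox"), (2, "lazy dog")]; the exact format comes from lcs.find_common_subsequences.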
22
+
23
+ def get_word_between_points(sentence, start_point, end_point):
24
+ words = nltk.word_tokenize(sentence)
25
+ stop_words = set(stopwords.words('english'))
26
+ # Locate the LCS strings as word positions (sentence.index would give character offsets)
+ start_index = words.index(nltk.word_tokenize(start_point[1])[-1])
+ end_index = words.index(nltk.word_tokenize(end_point[1])[0])
28
+
29
+ for word in words[start_index+1:end_index]:
30
+ if word.lower() not in stop_words:
31
+ return word, words.index(word)
32
+ return None, None
33
+
34
+ def get_logits_for_mask(sentence):
35
+ inputs = tokenizer(sentence, return_tensors="pt")
36
+ mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
37
+
38
+ with torch.no_grad():
39
+ outputs = model(**inputs)
40
+
41
+ logits = outputs.logits
42
+ mask_token_logits = logits[0, mask_token_index, :]
43
+ return mask_token_logits.squeeze()
44
+
45
+ def detect_watermark(sentence):
46
+ non_melting_points = get_non_melting_points(sentence)
47
+
48
+ if len(non_melting_points) < 2:
49
+ return False, "Not enough non-melting points found."
50
+
51
+ word_to_check, index = get_word_between_points(sentence, non_melting_points[0], non_melting_points[1])
52
+
53
+ if word_to_check is None:
54
+ return False, "No suitable word found between non-melting points."
55
+
56
+ words = nltk.word_tokenize(sentence)
57
+ masked_sentence = ' '.join(words[:index] + ['[MASK]'] + words[index+1:])
58
+
59
+ logits = get_logits_for_mask(masked_sentence)
60
+ filtered_logits = filter_logits(logits, permissible_indices)
61
+
62
+ top_predictions = filtered_logits.argsort()[-5:].tolist()
63
+ predicted_words = [tokenizer.decode([i]) for i in top_predictions]
64
+
65
+ if word_to_check in predicted_words:
66
+ return True, f"Watermark detected. The word '{word_to_check}' is in the permissible vocabulary."
67
+ else:
68
+ return False, f"No watermark detected. The word '{word_to_check}' is not in the permissible vocabulary."
69
+
70
+ # Example usage
71
+ # if __name__ == "__main__":
72
+ # test_sentence = "The quick brown fox jumps over the lazy dog."
73
+ # is_watermarked, message = detect_watermark(test_sentence)
74
+ # print(f"Is the sentence watermarked? {is_watermarked}")
75
+ # print(f"Detection message: {message}")