jgyasu committed on
Commit
ea7f5b6
1 Parent(s): 814be65

Upload folder using huggingface_hub

Files changed (9)
  1. .gitignore +1 -0
  2. README.md +3 -9
  3. app.py +284 -0
  4. masking_methods.py +25 -0
  5. paraphraser.py +29 -0
  6. requirements.text +17 -0
  7. sampling_methods.py +132 -0
  8. scores.py +51 -0
  9. tree.py +115 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: AIISC Watermarking Model
- emoji: 💻
- colorFrom: red
- colorTo: pink
- sdk: gradio
- sdk_version: 4.37.2
+ title: AIISC-Watermarking-Model
  app_file: app.py
- pinned: false
+ sdk: gradio
+ sdk_version: 4.36.0
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,284 @@
+ from transformers import AutoTokenizer
+ from transformers import AutoModelForSeq2SeqLM
+ import plotly.graph_objs as go
+ import textwrap
+ from transformers import pipeline
+ import re
+ import time
+ import requests
+ from PIL import Image
+ import itertools
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import matplotlib
+ from matplotlib.colors import ListedColormap, rgb2hex
+ import ipywidgets as widgets
+ from IPython.display import display, HTML
+ import pandas as pd
+ from pprint import pprint
+ from tenacity import retry
+ from tqdm import tqdm
+ import scipy.stats
+ import torch
+ from transformers import GPT2LMHeadModel
+ import seaborn as sns
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM
+ import random
+ from nltk.corpus import stopwords
+ from termcolor import colored
+ import nltk
+ from nltk.translate.bleu_score import sentence_bleu
+ from transformers import BertTokenizer, BertModel
+ import graphviz
+ import gradio as gr
+ from tree import generate_plot
+ from paraphraser import generate_paraphrase
+
+ nltk.download('stopwords')
+
+
+ # Function to Find the Longest Common Substring Words Subsequence
+ def longest_common_subss(original_sentence, paraphrased_sentences):
+     stop_words = set(stopwords.words('english'))
+     original_sentence_lower = original_sentence.lower()
+     paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+     paraphrased_sentences_no_stopwords = []
+
+     for sentence in paraphrased_sentences_lower:
+         words = re.findall(r'\b\w+\b', sentence)
+         filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+         paraphrased_sentences_no_stopwords.append(filtered_sentence)
+
+     results = []
+     for sentence in paraphrased_sentences_no_stopwords:
+         common_words = set(original_sentence_lower.split()) & set(sentence.split())
+         for word in common_words:
+             sentence = sentence.replace(word, colored(word, 'green'))
+         results.append({
+             "Original Sentence": original_sentence_lower,
+             "Paraphrased Sentence": sentence,
+             "Substrings Word Pair": common_words
+         })
+     return results
+
+ # Function to Find Common Substring Word between each paraphrase sentences
+ def common_substring_word(original_sentence, paraphrased_sentences):
+     stop_words = set(stopwords.words('english'))
+     original_sentence_lower = original_sentence.lower()
+     paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+     paraphrased_sentences_no_stopwords = []
+
+     for sentence in paraphrased_sentences_lower:
+         words = re.findall(r'\b\w+\b', sentence)
+         filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+         paraphrased_sentences_no_stopwords.append(filtered_sentence)
+
+     results = []
+     for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
+         common_words = set(original_sentence_lower.split()) & set(sentence.split())
+         common_substrings = ', '.join(sorted(common_words))
+         for word in common_words:
+             sentence = sentence.replace(word, colored(word, 'green'))
+         results.append({
+             f"Paraphrased Sentence {idx+1}": sentence,
+             "Common Substrings": common_substrings
+         })
+     return results
+
+
+ import re
+ from nltk.corpus import stopwords
+
+ def find_common_subsequences(sentence, str_list):
+     stop_words = set(stopwords.words('english'))
+     sentence = sentence.lower()
+
+     str_list = [s.lower() for s in str_list]
+
+     def is_present(lcs, str_list):
+         for string in str_list:
+             if lcs not in string:
+                 return False
+         return True
+
+     def remove_stop_words_and_special_chars(sentence):
+         sentence = re.sub(r'[^\w\s]', '', sentence)
+         words = sentence.split()
+         filtered_words = [word for word in words if word.lower() not in stop_words]
+         return " ".join(filtered_words)
+
+     sentence = remove_stop_words_and_special_chars(sentence)
+     str_list = [remove_stop_words_and_special_chars(s) for s in str_list]
+
+     words = sentence.split(" ")
+     common_grams = []
+     added_phrases = set()
+
+     def is_covered(subseq, added_phrases):
+         for phrase in added_phrases:
+             if subseq in phrase:
+                 return True
+         return False
+
+     for i in range(len(words) - 4):
+         penta = " ".join(words[i:i+5])
+         if is_present(penta, str_list):
+             common_grams.append(penta)
+             added_phrases.add(penta)
+
+     for i in range(len(words) - 3):
+         quad = " ".join(words[i:i+4])
+         if is_present(quad, str_list) and not is_covered(quad, added_phrases):
+             common_grams.append(quad)
+             added_phrases.add(quad)
+
+     for i in range(len(words) - 2):
+         tri = " ".join(words[i:i+3])
+         if is_present(tri, str_list) and not is_covered(tri, added_phrases):
+             common_grams.append(tri)
+             added_phrases.add(tri)
+
+     for i in range(len(words) - 1):
+         bi = " ".join(words[i:i+2])
+         if is_present(bi, str_list) and not is_covered(bi, added_phrases):
+             common_grams.append(bi)
+             added_phrases.add(bi)
+
+     for i in range(len(words)):
+         uni = words[i]
+         if is_present(uni, str_list) and not is_covered(uni, added_phrases):
+             common_grams.append(uni)
+             added_phrases.add(uni)
+
+     return common_grams
+
+ def llm_output(prompt):
+     return prompt, prompt
+
+ def highlight_phrases_with_colors(sentences, phrases):
+     color_map = {}
+     color_index = 0
+     highlighted_html = []
+     idx = 1
+     for sentence in sentences:
+         sentence_with_idx = f"{idx}. {sentence}"
+         idx += 1
+         highlighted_sentence = sentence_with_idx
+         phrase_count = 0
+         words = re.findall(r'\b\w+\b', sentence)
+         word_index = 1
+         for phrase in phrases:
+             if phrase not in color_map:
+                 color_map[phrase] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
+                 color_index += 1
+             escaped_phrase = re.escape(phrase)
+             pattern = rf'\b{escaped_phrase}\b'
+             highlighted_sentence, num_replacements = re.subn(
+                 pattern,
+                 lambda m, count=phrase_count, color=color_map[phrase], index=word_index: (
+                     f'<span style="background-color: {color}; font-weight: bold;'
+                     f' padding: 2px 4px; border-radius: 2px; position: relative;">'
+                     f'<span style="background-color: black; color: white; border-radius: 50%;'
+                     f' padding: 2px 5px; margin-right: 5px;">{index}</span>'
+                     f'{m.group(0)}'
+                     f'</span>'
+                 ),
+                 highlighted_sentence,
+                 flags=re.IGNORECASE
+             )
+             if num_replacements > 0:
+                 phrase_count += 1
+                 word_index += 1
+         highlighted_html.append(highlighted_sentence)
+     final_html = "<br><br>".join(highlighted_html)
+     return f'''
+     <div style="border: solid 1px #; padding: 16px; background-color: #FFFFFF; color: #374151; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 2px;">
+         <h3 style="margin-top: 0; font-size: 1em; color: #111827;">Paraphrased And Highlighted Text</h3>
+         <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 2px;">{final_html}</div>
+     </div>
+     '''
+
+ import re
+
+ def highlight_phrases_with_colors_single_sentence(sentence, phrases):
+     color_map = {}
+     color_index = 0
+     highlighted_sentence = sentence
+     phrase_count = 0
+     words = re.findall(r'\b\w+\b', sentence)
+     word_index = 1
+
+     for phrase in phrases:
+         if phrase not in color_map:
+             color_map[phrase] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
+             color_index += 1
+         escaped_phrase = re.escape(phrase)
+         pattern = rf'\b{escaped_phrase}\b'
+         highlighted_sentence, num_replacements = re.subn(
+             pattern,
+             lambda m, count=phrase_count, color=color_map[phrase], index=word_index: (
+                 f'<span style="background-color: {color}; font-weight: bold;'
+                 f' padding: 2px 4px; border-radius: 2px; position: relative;">'
+                 f'<span style="background-color: black; color: white; border-radius: 50%;'
+                 f' padding: 2px 5px; margin-right: 5px;">{index}</span>'
+                 f'{m.group(0)}'
+                 f'</span>'
+             ),
+             highlighted_sentence,
+             flags=re.IGNORECASE
+         )
+         if num_replacements > 0:
+             phrase_count += 1
+             word_index += 1
+
+     final_html = highlighted_sentence
+     return f'''
+     <div style="border: solid 1px #; padding: 16px; background-color: #FFFFFF; color: #374151; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 2px;">
+         <h3 style="margin-top: 0; font-size: 1em; color: #111827;">Selected Sentence</h3>
+         <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 2px;">{final_html}</div>
+     </div>
+     '''
+
+
+ # Function for the Gradio interface
+ def model(prompt):
+     generated, sentence = llm_output(prompt)
+     res = generate_paraphrase(sentence)
+     common_subs = longest_common_subss(sentence, res)
+     common_grams = find_common_subsequences(sentence, res)
+     for i in range(len(common_subs)):
+         common_subs[i]["Paraphrased Sentence"] = res[i]
+     generated_highlighted = highlight_phrases_with_colors_single_sentence(generated, common_grams)
+     result = highlight_phrases_with_colors(res, common_grams)
+     tree = generate_plot(sentence)
+     return generated, generated_highlighted, result, tree
+
+ with gr.Blocks(theme = gr.themes.Monochrome()) as demo:
+     gr.Markdown("# Paraphrases the Text and Highlights the Non-melting Points")
+
+     with gr.Row():
+         user_input = gr.Textbox(label="User Prompt")
+
+     with gr.Row():
+         submit_button = gr.Button("Submit")
+         clear_button = gr.Button("Clear")
+
+     with gr.Row():
+         ai_output = gr.Textbox(label="AI-generated Text (Llama3)")
+
+     with gr.Row():
+         selected_sentence = gr.HTML()
+
+     with gr.Row():
+         html_output = gr.HTML()
+
+     with gr.Row():
+         tree = gr.Plot()
+
+     submit_button.click(model, inputs=user_input, outputs=[ai_output, selected_sentence, html_output, tree])
+     clear_button.click(lambda: "", inputs=None, outputs=user_input)
+     clear_button.click(lambda: "", inputs=None, outputs=[ai_output, selected_sentence, html_output, tree])
+
+ # Launch the demo
+ demo.launch(share=True)
+
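A rough sanity check of find_common_subsequences as defined above, on hypothetical sentences (the NLTK stopword list must already be downloaded). Since importing app.py also builds and launches the Gradio demo, this snippet would in practice be run inside app.py itself or with the helper copied out:

    sentence = "the quick brown fox jumps over the lazy dog"
    paraphrases = [
        "a quick brown fox leaped over a lazy dog",
        "the quick brown fox hops over the lazy dog",
    ]
    # Stop words and punctuation are stripped first, then the longest n-grams
    # (5-grams down to unigrams) shared by the prompt and every paraphrase are
    # collected, skipping n-grams already covered by a longer match.
    print(find_common_subsequences(sentence, paraphrases))
    # expected: ['quick brown fox', 'lazy dog']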
masking_methods.py ADDED
@@ -0,0 +1,25 @@
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+ from transformers import pipeline
+ import random
+ from nltk.corpus import stopwords
+
+ # Masking Model
+ def mask_non_stopword(sentence):
+     stop_words = set(stopwords.words('english'))
+     words = sentence.split()
+     non_stop_words = [word for word in words if word.lower() not in stop_words]
+     if not non_stop_words:
+         return sentence
+     word_to_mask = random.choice(non_stop_words)
+     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
+     return masked_sentence
+
+ # Load tokenizer and model for masked language model
+ tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+ model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+ fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
+
+ def mask(sentence):
+     predictions = fill_mask(sentence)
+     masked_sentences = [predictions[i]['sequence'] for i in range(len(predictions))]
+     return masked_sentences
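A small usage sketch for the two helpers above, on a hypothetical input. The NLTK stopword list and the bert-large-cased-whole-word-masking checkpoint are downloaded on first use, and the fill-mask pipeline is built at import time:

    from masking_methods import mask_non_stopword, mask

    masked = mask_non_stopword("The watermark survives paraphrasing")
    print(masked)        # e.g. "The watermark survives [MASK]" (a random non-stopword is masked)
    print(mask(masked))  # the pipeline's top fill-mask completions, returned as full sentences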
paraphraser.py ADDED
@@ -0,0 +1,29 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ # Function to Initialize the Model
+ def init_model():
+     para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+     para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+     return para_tokenizer, para_model
+
+ # Function to Paraphrase the Text
+ def paraphrase(question, para_tokenizer, para_model, num_beams=5, num_beam_groups=5, num_return_sequences=5, repetition_penalty=10.0, diversity_penalty=3.0, no_repeat_ngram_size=2, temperature=0.7, max_length=64):
+     input_ids = para_tokenizer(
+         f'paraphrase: {question}',
+         return_tensors="pt", padding="longest",
+         max_length=max_length,
+         truncation=True,
+     ).input_ids
+     outputs = para_model.generate(
+         input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
+         num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
+         num_beams=num_beams, num_beam_groups=num_beam_groups,
+         max_length=max_length, diversity_penalty=diversity_penalty
+     )
+     res = para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+     return res
+
+ def generate_paraphrase(question):
+     para_tokenizer, para_model = init_model()
+     res = paraphrase(question, para_tokenizer, para_model)
+     return res
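A usage sketch on a hypothetical prompt; the humarin/chatgpt_paraphraser_on_T5_base checkpoint is fetched from the Hub on first call. Note that generate_paraphrase re-initialises the tokenizer and model on every call, so a long-running app may want to cache the result of init_model at module level:

    from paraphraser import generate_paraphrase

    for p in generate_paraphrase("Large language models can be watermarked."):
        print(p)  # five paraphrases from diverse beam search (num_return_sequences=5)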
requirements.text ADDED
@@ -0,0 +1,17 @@
+ ipywidgets
+ transformers
+ plotly
+ requests
+ Pillow
+ numpy
+ matplotlib
+ tqdm
+ scipy
+ torch
+ seaborn
+ termcolor
+ nltk
+ tenacity
+ pandas
+ graphviz==0.20.3
+ gradio
sampling_methods.py ADDED
@@ -0,0 +1,132 @@
+ import re
+ from nltk.corpus import stopwords
+ import random
+ from termcolor import colored
+
+ # Function to Watermark a Word Take Randomly Between Each lcs Point (Random Sampling)
+ def random_sampling(original_sentence, paraphrased_sentences):
+     stop_words = set(stopwords.words('english'))
+     original_sentence_lower = original_sentence.lower()
+     paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+     paraphrased_sentences_no_stopwords = []
+
+     for sentence in paraphrased_sentences_lower:
+         words = re.findall(r'\b\w+\b', sentence)
+         filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+         paraphrased_sentences_no_stopwords.append(filtered_sentence)
+
+     results = []
+     for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
+         common_words = set(original_sentence_lower.split()) & set(sentence.split())
+         common_substrings = ', '.join(sorted(common_words))
+
+         words_to_replace = [word for word in sentence.split() if word not in common_words]
+         if words_to_replace:
+             word_to_mark = random.choice(words_to_replace)
+             sentence = sentence.replace(word_to_mark, colored(word_to_mark, 'red'))
+
+         for word in common_words:
+             sentence = sentence.replace(word, colored(word, 'green'))
+
+         results.append({
+             f"Paraphrased Sentence {idx+1}": sentence,
+             "Common Substrings": common_substrings
+         })
+     return results
+
+ # Function for Inverse Transform Sampling
+ def inverse_transform_sampling(original_sentence, paraphrased_sentences):
+     stop_words = set(stopwords.words('english'))
+     original_sentence_lower = original_sentence.lower()
+     paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+     paraphrased_sentences_no_stopwords = []
+
+     for sentence in paraphrased_sentences_lower:
+         words = re.findall(r'\b\w+\b', sentence)
+         filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+         paraphrased_sentences_no_stopwords.append(filtered_sentence)
+
+     results = []
+     for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
+         common_words = set(original_sentence_lower.split()) & set(sentence.split())
+         common_substrings = ', '.join(sorted(common_words))
+
+         words_to_replace = [word for word in sentence.split() if word not in common_words]
+         if words_to_replace:
+             probabilities = [1 / len(words_to_replace)] * len(words_to_replace)
+             chosen_word = random.choices(words_to_replace, weights=probabilities)[0]
+             sentence = sentence.replace(chosen_word, colored(chosen_word, 'magenta'))
+
+         for word in common_words:
+             sentence = sentence.replace(word, colored(word, 'green'))
+
+         results.append({
+             f"Paraphrased Sentence {idx+1}": sentence,
+             "Common Substrings": common_substrings
+         })
+     return results
+
+ # Function for Contextual Sampling
+ def contextual_sampling(original_sentence, paraphrased_sentences):
+     stop_words = set(stopwords.words('english'))
+     original_sentence_lower = original_sentence.lower()
+     paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+     paraphrased_sentences_no_stopwords = []
+
+     for sentence in paraphrased_sentences_lower:
+         words = re.findall(r'\b\w+\b', sentence)
+         filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+         paraphrased_sentences_no_stopwords.append(filtered_sentence)
+
+     results = []
+     for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
+         common_words = set(original_sentence_lower.split()) & set(sentence.split())
+         common_substrings = ', '.join(sorted(common_words))
+
+         words_to_replace = [word for word in sentence.split() if word not in common_words]
+         if words_to_replace:
+             context = " ".join([word for word in sentence.split() if word not in common_words])
+             chosen_word = random.choice(words_to_replace)
+             sentence = sentence.replace(chosen_word, colored(chosen_word, 'red'))
+
+         for word in common_words:
+             sentence = sentence.replace(word, colored(word, 'green'))
+
+         results.append({
+             f"Paraphrased Sentence {idx+1}": sentence,
+             "Common Substrings": common_substrings
+         })
+     return results
+
+ # Function for Exponential Minimum Sampling
+ def exponential_minimum_sampling(original_sentence, paraphrased_sentences):
+     stop_words = set(stopwords.words('english'))
+     original_sentence_lower = original_sentence.lower()
+     paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
+     paraphrased_sentences_no_stopwords = []
+
+     for sentence in paraphrased_sentences_lower:
+         words = re.findall(r'\b\w+\b', sentence)
+         filtered_sentence = ' '.join([word for word in words if word not in stop_words])
+         paraphrased_sentences_no_stopwords.append(filtered_sentence)
+
+     results = []
+     for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
+         common_words = set(original_sentence_lower.split()) & set(sentence.split())
+         common_substrings = ', '.join(sorted(common_words))
+
+         words_to_replace = [word for word in sentence.split() if word not in common_words]
+         if words_to_replace:
+             num_words = len(words_to_replace)
+             probabilities = [2 ** (-i) for i in range(num_words)]
+             chosen_word = random.choices(words_to_replace, weights=probabilities)[0]
+             sentence = sentence.replace(chosen_word, colored(chosen_word, 'red'))
+
+         for word in common_words:
+             sentence = sentence.replace(word, colored(word, 'green'))
+
+         results.append({
+             f"Paraphrased Sentence {idx+1}": sentence,
+             "Common Substrings": common_substrings
+         })
+     return results
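An illustrative call to random_sampling on hypothetical sentences; the other three samplers take the same arguments and differ mainly in how the marked word is drawn (uniform weights, contextual choice, or exponentially decaying weights). Each returned dict pairs one paraphrase, with the sampled word wrapped in red ANSI colour codes and words shared with the original in green, with the comma-separated list of shared words:

    from sampling_methods import random_sampling

    original = "the cat sat on the mat"
    paraphrases = ["a cat rested on the mat", "the cat sat upon a rug"]
    for row in random_sampling(original, paraphrases):
        print(row)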
scores.py ADDED
@@ -0,0 +1,51 @@
+ import torch
+ import numpy as np
+ from nltk.translate.bleu_score import sentence_bleu
+ from transformers import BertTokenizer, BertModel
+
+ # Function to Calculate the BLEU score
+ def calculate_bleu(reference, candidate):
+     return sentence_bleu([reference], candidate)
+
+ # Function to calculate BERT score
+ def calculate_bert(reference, candidate):
+     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+     model = BertModel.from_pretrained('bert-base-uncased')
+
+     reference_tokens = tokenizer.tokenize(reference)
+     candidate_tokens = tokenizer.tokenize(candidate)
+
+     reference_ids = tokenizer.encode(reference, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt")
+     candidate_ids = tokenizer.encode(candidate, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt")
+
+     with torch.no_grad():
+         reference_outputs = model(reference_ids)
+         candidate_outputs = model(candidate_ids)
+
+     reference_embeddings = reference_outputs[0][:, 0, :].numpy()
+     candidate_embeddings = candidate_outputs[0][:, 0, :].numpy()
+
+     cosine_similarity = np.dot(reference_embeddings, candidate_embeddings.T) / (np.linalg.norm(reference_embeddings) * np.linalg.norm(candidate_embeddings))
+     return np.mean(cosine_similarity)
+
+ # Function to calculate minimum edit distance
+ def min_edit_distance(reference, candidate):
+     m = len(reference)
+     n = len(candidate)
+
+     dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+     for i in range(m + 1):
+         for j in range(n + 1):
+             if i == 0:
+                 dp[i][j] = j
+             elif j == 0:
+                 dp[i][j] = i
+             elif reference[i - 1] == candidate[j - 1]:
+                 dp[i][j] = dp[i - 1][j - 1]
+             else:
+                 dp[i][j] = 1 + min(dp[i][j - 1],      # Insert
+                                    dp[i - 1][j],      # Remove
+                                    dp[i - 1][j - 1])  # Replace
+
+     return dp[m][n]
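A quick check of the three metrics on hypothetical strings. sentence_bleu expects tokenised input, so the texts are split before calling calculate_bleu; calculate_bert reloads bert-base-uncased on every call, which is convenient for a demo but slow in a loop:

    from scores import calculate_bleu, calculate_bert, min_edit_distance

    ref = "the model embeds a watermark"
    cand = "the model inserts a watermark"
    print(calculate_bleu(ref.split(), cand.split()))  # n-gram overlap, between 0 and 1
    print(calculate_bert(ref, cand))                  # cosine similarity of [CLS] embeddings
    print(min_edit_distance(ref, cand))               # character-level Levenshtein distance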
tree.py ADDED
@@ -0,0 +1,115 @@
+ import plotly.graph_objs as go
+ import textwrap
+ import re
+ from collections import defaultdict
+ from paraphraser import generate_paraphrase
+ from masking_methods import mask, mask_non_stopword
+
+ def generate_plot(original_sentence):
+     paraphrased_sentences = generate_paraphrase(original_sentence)
+     first_paraphrased_sentence = paraphrased_sentences[0]
+     masked_sentence = mask_non_stopword(first_paraphrased_sentence)
+     masked_versions = mask(masked_sentence)
+
+     nodes = []
+     nodes.append(original_sentence)
+     nodes.extend(paraphrased_sentences)
+     nodes.extend(masked_versions)
+     nodes[0] += ' L0'
+     para_len = len(paraphrased_sentences)
+     for i in range(1, para_len+1):
+         nodes[i] += ' L1'
+     for i in range(para_len+1, len(nodes)):
+         nodes[i] += ' L2'
+
+     cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
+     wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=30)) for node in cleaned_nodes]
+
+     def get_levels_and_edges(nodes):
+         levels = {}
+         edges = []
+         for i, node in enumerate(nodes):
+             level = int(node.split()[-1][1])
+             levels[i] = level
+
+         # Add edges from L0 to all L1 nodes
+         root_node = next(i for i, level in levels.items() if level == 0)
+         for i, level in levels.items():
+             if level == 1:
+                 edges.append((root_node, i))
+
+         # Identify the first L1 node
+         first_l1_node = next(i for i, level in levels.items() if level == 1)
+         # Add edges from the first L1 node to all L2 nodes
+         for i, level in levels.items():
+             if level == 2:
+                 edges.append((first_l1_node, i))
+
+         return levels, edges
+
+     # Get levels and dynamic edges
+     levels, edges = get_levels_and_edges(nodes)
+     max_level = max(levels.values())
+
+     # Calculate positions
+     positions = {}
+     level_widths = defaultdict(int)
+     for node, level in levels.items():
+         level_widths[level] += 1
+
+     x_offsets = {level: - (width - 1) / 2 for level, width in level_widths.items()}
+     y_gap = 4
+
+     for node, level in levels.items():
+         positions[node] = (x_offsets[level], -level * y_gap)
+         x_offsets[level] += 1
+
+     # Create figure
+     fig = go.Figure()
+
+     # Add nodes to the figure
+     for i, node in enumerate(wrapped_nodes):
+         x, y = positions[i]
+         fig.add_trace(go.Scatter(
+             x=[x],
+             y=[y],
+             mode='markers',
+             marker=dict(size=10, color='blue'),
+             hoverinfo='none'
+         ))
+         fig.add_annotation(
+             x=x,
+             y=y,
+             text=node,
+             showarrow=False,
+             yshift=20,  # Adjust the y-shift value to avoid overlap
+             align="center",
+             font=dict(size=10),
+             bordercolor='black',
+             borderwidth=1,
+             borderpad=4,
+             bgcolor='white',
+             width=200
+         )
+
+     # Add edges to the figure
+     for edge in edges:
+         x0, y0 = positions[edge[0]]
+         x1, y1 = positions[edge[1]]
+         fig.add_trace(go.Scatter(
+             x=[x0, x1],
+             y=[y0, y1],
+             mode='lines',
+             line=dict(color='black', width=2)
+         ))
+
+     fig.update_layout(
+         showlegend=False,
+         margin=dict(t=50, b=50, l=50, r=50),
+         xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+         yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+         width=1470,
+         height=800  # Increase height to provide more space
+     )
+
+     return fig
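A stand-alone sketch of generate_plot on a hypothetical prompt; it calls the paraphraser and masking models above, so those checkpoints are downloaded on first use. Outside the Gradio gr.Plot component, the returned Plotly figure can simply be saved or shown:

    from tree import generate_plot

    fig = generate_plot("Watermarking makes AI-generated text traceable.")
    fig.write_html("tree.html")  # or fig.show() in a notebook / browser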