Spaces:
Sleeping
Sleeping
from transformers import AutoTokenizer | |
from transformers import AutoModelForSeq2SeqLM | |
import plotly.graph_objs as go | |
import textwrap | |
from transformers import pipeline | |
import re | |
import time | |
import requests | |
from PIL import Image | |
import itertools | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import matplotlib | |
from matplotlib.colors import ListedColormap, rgb2hex | |
import ipywidgets as widgets | |
from IPython.display import display, HTML | |
import pandas as pd | |
from pprint import pprint | |
from tenacity import retry | |
from tqdm import tqdm | |
import scipy.stats | |
import torch | |
from transformers import GPT2LMHeadModel | |
import seaborn as sns | |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM | |
import random | |
from nltk.corpus import stopwords | |
from termcolor import colored | |
import nltk | |
from nltk.translate.bleu_score import sentence_bleu | |
from transformers import BertTokenizer, BertModel | |
import graphviz | |
import gradio as gr | |
from tree import generate_plot | |
from paraphraser import generate_paraphrase | |
# Make sure the NLTK English stop-word corpus is available before any of the
# helpers below call ``stopwords.words('english')``.  Only hit the network
# when the corpus is not already installed, so a restart works offline.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
# Function to find the words each paraphrase shares with the original sentence
def longest_common_subss(original_sentence, paraphrased_sentences):
    """Report, per paraphrase, the non-stop-words it shares with the original.

    Both the original sentence and every paraphrase are lower-cased and
    tokenised with the same ``\\b\\w+\\b`` pattern, so trailing punctuation
    on the original ("dog.") no longer prevents a match with "dog".

    Args:
        original_sentence: The reference sentence.
        paraphrased_sentences: Iterable of paraphrase strings.

    Returns:
        A list with one dict per paraphrase, in input order, with the keys
        ``"Original Sentence"`` (the lower-cased original),
        ``"Paraphrased Sentence"`` (the paraphrase with stop words removed
        and every shared word passed through ``colored(word, 'green')``) and
        ``"Substrings Word Pair"`` (the set of shared words).
    """
    stop_words = set(stopwords.words('english'))
    word_pattern = re.compile(r'\b\w+\b')

    original_sentence_lower = original_sentence.lower()
    original_words = set(word_pattern.findall(original_sentence_lower))

    results = []
    for paraphrase in paraphrased_sentences:
        content_words = [
            word
            for word in word_pattern.findall(paraphrase.lower())
            if word not in stop_words
        ]
        common_words = original_words & set(content_words)
        # Colour token by token rather than with str.replace(): replace()
        # also hits substrings of longer words and can rewrite the escape
        # codes inserted for a previously coloured word.
        highlighted = ' '.join(
            colored(word, 'green') if word in common_words else word
            for word in content_words
        )
        results.append({
            "Original Sentence": original_sentence_lower,
            "Paraphrased Sentence": highlighted,
            "Substrings Word Pair": common_words,
        })
    return results
# Function to find the common words between the original and each paraphrase
def common_substring_word(original_sentence, paraphrased_sentences):
    """List, per paraphrase, the non-stop-words it shares with the original.

    Both the original sentence and every paraphrase are lower-cased and
    tokenised with the same ``\\b\\w+\\b`` pattern, so punctuation attached
    to a word in the original does not hide a shared word.

    Args:
        original_sentence: The reference sentence.
        paraphrased_sentences: Iterable of paraphrase strings.

    Returns:
        A list with one dict per paraphrase, in input order.  Each dict has
        the key ``"Paraphrased Sentence <n>"`` (1-based position) holding
        the paraphrase with stop words removed and every shared word passed
        through ``colored(word, 'green')``, and the key
        ``"Common Substrings"`` holding the shared words sorted and joined
        with ``", "``.
    """
    stop_words = set(stopwords.words('english'))
    word_pattern = re.compile(r'\b\w+\b')
    original_words = set(word_pattern.findall(original_sentence.lower()))

    results = []
    for position, paraphrase in enumerate(paraphrased_sentences, start=1):
        content_words = [
            word
            for word in word_pattern.findall(paraphrase.lower())
            if word not in stop_words
        ]
        common_words = original_words & set(content_words)
        # Colour whole tokens only; str.replace() would also colour
        # substrings of longer words and could corrupt escape codes that
        # were inserted for an earlier word.
        highlighted = ' '.join(
            colored(word, 'green') if word in common_words else word
            for word in content_words
        )
        results.append({
            f"Paraphrased Sentence {position}": highlighted,
            "Common Substrings": ', '.join(sorted(common_words)),
        })
    return results
import re | |
from nltk.corpus import stopwords | |
def find_common_subsequences(sentence, str_list):
    """Find word n-grams of ``sentence`` that occur in every string of ``str_list``.

    All inputs are lower-cased, stripped of every character that is neither
    a word character nor whitespace, and filtered of English stop words.
    N-grams of 5 down to 1 consecutive remaining words of ``sentence`` are
    then tested, longest first and left to right.  An n-gram is kept when
    it appears as a run of whole words in every cleaned string and is not
    already contained in a longer (or identical) kept n-gram.

    Matching is done on whole words: "art" does not match inside "start".

    Args:
        sentence: The reference sentence.
        str_list: Iterable of strings to compare against.

    Returns:
        List of space-joined n-grams, longest sizes first, without
        duplicates.  Empty when ``sentence`` has no content words.  As with
        any "present in all" test, an empty ``str_list`` accepts every
        n-gram.
    """
    stop_words = set(stopwords.words('english'))
    max_gram_size = 5

    def content_words(text):
        # Same clean-up for the sentence and for every candidate string.
        stripped = re.sub(r'[^\w\s]', '', text.lower())
        return [word for word in stripped.split() if word not in stop_words]

    words = content_words(sentence)
    # Pad with spaces so that a padded n-gram can only match on word
    # boundaries inside a padded candidate.
    padded_candidates = [f" {' '.join(content_words(s))} " for s in str_list]

    common_grams = []
    for size in range(max_gram_size, 0, -1):
        for start in range(len(words) - size + 1):
            gram = ' '.join(words[start:start + size])
            needle = f" {gram} "
            # Skip n-grams already reported, alone or inside a longer one.
            if any(needle in f" {kept} " for kept in common_grams):
                continue
            if all(needle in candidate for candidate in padded_candidates):
                common_grams.append(gram)
    return common_grams
def llm_output(prompt):
    """Return ``prompt`` twice, as a ``(generated, sentence)`` pair.

    No model is called here: the prompt itself is handed back for both
    positions of the tuple.
    """
    generated_text = prompt
    selected_sentence = prompt
    return generated_text, selected_sentence
def _highlight_phrases(text, phrases, color_map):
    """Return ``text`` as HTML with whole-word phrase occurrences wrapped in badge spans.

    All matching is done against the raw ``text`` and the markup is
    assembled afterwards, so a phrase such as "black" or "color" can never
    match inside the ``style`` attribute of a span inserted for an earlier
    phrase.  Phrases are tried in the given order; an occurrence that
    overlaps text already claimed by an earlier phrase is skipped.

    Args:
        text: Plain-text sentence.
        phrases: Phrases to look for (case-insensitive, whole words).
        color_map: Dict mapping phrase -> CSS colour.  Missing phrases are
            added in place, so one map can be shared across sentences.

    Returns:
        HTML string.  Text outside and inside the highlights is
        HTML-escaped.  Each highlight shows a numeric badge: phrases found
        in this text are numbered 1, 2, 3, ... in ``phrases`` order.
    """
    # Local import: the file's import block does not provide ``html``.
    from html import escape

    claimed = []  # (start, end, colour, badge label) of accepted matches
    next_label = 1
    for phrase in phrases:
        if not phrase:
            # An empty phrase would match at every word boundary.
            continue
        if phrase not in color_map:
            color_map[phrase] = f'hsl({len(color_map) * 60 % 360}, 70%, 80%)'
        pattern = rf'\b{re.escape(phrase)}\b'
        matched = False
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            overlaps = any(
                match.start() < end and start < match.end()
                for start, end, _, _ in claimed
            )
            if overlaps:
                continue
            claimed.append(
                (match.start(), match.end(), color_map[phrase], next_label)
            )
            matched = True
        if matched:
            next_label += 1

    pieces = []
    cursor = 0
    for start, end, color, label in sorted(claimed):
        pieces.append(escape(text[cursor:start], quote=False))
        pieces.append(
            f'<span style="background-color: {color}; font-weight: bold;'
            f' padding: 2px 4px; border-radius: 2px; position: relative;">'
            f'<span style="background-color: black; color: white; border-radius: 50%;'
            f' padding: 2px 5px; margin-right: 5px;">{label}</span>'
            f'{escape(text[start:end], quote=False)}'
            f'</span>'
        )
        cursor = end
    pieces.append(escape(text[cursor:], quote=False))
    return "".join(pieces)


def _render_card(title, inner_html):
    """Wrap ``inner_html`` in the titled card markup shared by both views."""
    # NOTE(review): "border: solid 1px #;" has no colour value after '#';
    # kept as found because the intended colour is not visible here.
    return f'''
<div style="border: solid 1px #; padding: 16px; background-color: #FFFFFF; color: #374151; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 2px;">
<h3 style="margin-top: 0; font-size: 1em; color: #111827;">{title}</h3>
<div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 2px;">{inner_html}</div>
</div>
'''


def highlight_phrases_with_colors(sentences, phrases):
    """Render numbered sentences as an HTML card with ``phrases`` highlighted.

    Each sentence is prefixed with its 1-based position ("1. ", "2. ", ...)
    and sentences are separated by ``<br><br>``.  A phrase keeps the same
    colour in every sentence.  The number prefix itself is never
    highlighted.

    Args:
        sentences: Iterable of plain-text sentences.
        phrases: Phrases to highlight (case-insensitive, whole words).

    Returns:
        HTML string titled "Paraphrased And Highlighted Text".
    """
    color_map = {}
    numbered = [
        f"{position}. {_highlight_phrases(sentence, phrases, color_map)}"
        for position, sentence in enumerate(sentences, start=1)
    ]
    return _render_card("Paraphrased And Highlighted Text", "<br><br>".join(numbered))


import re


def highlight_phrases_with_colors_single_sentence(sentence, phrases):
    """Render one sentence as an HTML card with ``phrases`` highlighted.

    Args:
        sentence: Plain-text sentence.
        phrases: Phrases to highlight (case-insensitive, whole words).

    Returns:
        HTML string titled "Selected Sentence".
    """
    return _render_card("Selected Sentence", _highlight_phrases(sentence, phrases, {}))
# Function for the Gradio interface
def model(prompt):
    """Run the paraphrase-and-highlight pipeline for one user prompt.

    The prompt goes through ``llm_output``; the resulting sentence is
    paraphrased with ``generate_paraphrase``, the n-grams shared by the
    sentence and all paraphrases are computed with
    ``find_common_subsequences``, and those n-grams are highlighted in both
    the generated text and the paraphrases.  ``generate_plot`` is called on
    the sentence for the last output.

    Args:
        prompt: Text entered by the user.

    Returns:
        Tuple ``(generated, generated_highlighted, result, tree)``: the
        generated text, its highlighted HTML, the highlighted HTML of the
        paraphrases, and whatever ``generate_plot`` returns.
    """
    generated, sentence = llm_output(prompt)
    # presumably a list of paraphrase strings — verify against paraphraser
    paraphrases = generate_paraphrase(sentence)
    common_grams = find_common_subsequences(sentence, paraphrases)
    generated_highlighted = highlight_phrases_with_colors_single_sentence(
        generated, common_grams
    )
    result = highlight_phrases_with_colors(paraphrases, common_grams)
    tree = generate_plot(sentence)
    return generated, generated_highlighted, result, tree
# Gradio UI: a prompt box, Submit/Clear buttons, and four outputs fed by model().
with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("# Paraphrases the Text and Highlights the Non-melting Points")

    with gr.Row():
        user_input = gr.Textbox(label="User Prompt")

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear")

    with gr.Row():
        ai_output = gr.Textbox(label="AI-generated Text (Llama3)")

    with gr.Row():
        selected_sentence = gr.HTML()

    with gr.Row():
        html_output = gr.HTML()

    with gr.Row():
        tree = gr.Plot()

    submit_button.click(
        model,
        inputs=user_input,
        outputs=[ai_output, selected_sentence, html_output, tree],
    )
    # A handler must return one value per output component.  The previous
    # second handler returned a single "" for four outputs; this one resets
    # every component and clears the plot with None.
    clear_button.click(
        lambda: ("", "", "", "", None),
        inputs=None,
        outputs=[user_input, ai_output, selected_sentence, html_output, tree],
    )

# Launch the demo
demo.launch(share=True)