Spaces:

jgyasu
/

text-paraphraser

Running

App Files Community

jgyasu committed on Jun 20

Commit

be52172

•

1 Parent(s): 5995f2a

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -68

app.py CHANGED Viewed

@@ -7,10 +7,6 @@ Original file is located at
     https://colab.research.google.com/drive/1pFGR4uvXMMWVJFQeFmn--arumSxqa5Yy
 """
-import gradio as gr
-# import streamlit as st
 from transformers import AutoTokenizer
 from transformers import AutoModelForSeq2SeqLM
 import plotly.graph_objects as go
@@ -35,7 +31,7 @@ import scipy.stats
 import torch
 from transformers import GPT2LMHeadModel
 import seaborn as sns
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 # from colorama import Fore, Style
 # import openai
 import random
@@ -44,8 +40,11 @@ from termcolor import colored
 import nltk
 from nltk.translate.bleu_score import sentence_bleu
 from transformers import BertTokenizer, BertModel
-import nltk
 nltk.download('stopwords')
 # Function to Initialize the Model
@@ -301,12 +300,6 @@ def generate_paraphrase(question):
 question = "Following the declaration of the State of Israel in 1948, neighboring Arab states invaded. The war ended with Israel controlling a significant portion of the territory. Many Palestinians became refugees."
-import nltk
-nltk.download('punkt')
-import re
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
 import re
 from nltk.corpus import stopwords
@@ -373,52 +366,25 @@ def find_common_subsequences(sentence, str_list):
     return common_grams
-question = '''the colorado republican party sent a mass email last week with the subject line "god hates pride"'''
-res = generate_paraphrase(question)
-res
-common_grams = find_common_subsequences(question, res[0:3])
-common_grams
-common_gram_words = [word for gram in common_grams for word in gram.split()]
-common_gram_words
 def llm_output(prompt):
-    # sequences = text_generator(prompt)
-    # gen_text = sequences[0]["generated_text"]
-    # sentences = gen_text.split('.')
-    # # first_sentence = get_first_sentence(gen_text[len(prompt):])
-    # return gen_text,sentences[-3]
-    return prompt,prompt
-import re
-import html
 def highlight_phrases_with_colors(sentences, phrases):
-    color_map = {}  # Dictionary to store color assignments for each phrase
-    color_index = 0  # Index to assign colors sequentially
-    # Generate HTML for highlighting each sentence
     highlighted_html = []
     idx = 1
     for sentence in sentences:
         sentence_with_idx = f"{idx}. {sentence}"
         idx += 1
-        highlighted_sentence = html.escape(sentence_with_idx)
         phrase_count = 0
-        # Split sentence into words to apply numbering
         words = re.findall(r'\b\w+\b', sentence)
-        word_index = 1  # Index to track words
-        # Highlight each phrase with a unique color and number
         for phrase in phrases:
             if phrase not in color_map:
-                # Assign a new color if the phrase hasn't been encountered before
                 color_map[phrase] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
                 color_index += 1
             escaped_phrase = re.escape(phrase)
             pattern = rf'\b{escaped_phrase}\b'
             highlighted_sentence, num_replacements = re.subn(
@@ -436,34 +402,68 @@ def highlight_phrases_with_colors(sentences, phrases):
             )
             if num_replacements > 0:
                 phrase_count += 1
-                word_index += 1  # Increment word index after each replacement
         highlighted_html.append(highlighted_sentence)
-    # Join sentences with line breaks
     final_html = "<br><br>".join(highlighted_html)
-    # Wrap in a container div for styling
     return f'''
-    <div style="border: solid 1px #; padding: 16px; background-color: #FFFFFF; color: #374151; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 12px;">
-    <h3 style="margin-top: 0; font-size: 1.25em; color: #111827;">Paraphrased And Highlighted Text</h3>
-    <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 12px;">{final_html}</div>
     </div>
     '''
 def model(prompt):
-    generated,sentence = llm_output(prompt)
     res = generate_paraphrase(sentence)
-    common_subs = longest_common_subss(sentence,res)
-#     non_melting  = non_melting_points(sentence, res)
-    common_grams = find_common_subsequences(sentence,res)
-    # common_gram_words = [word for gram in common_grams for word in gram.split()]
     for i in range(len(common_subs)):
         common_subs[i]["Paraphrased Sentence"] = res[i]
-    result = highlight_phrases_with_colors(res,common_grams)
-    return generated, result
-# model(question)
 with gr.Blocks(theme = gr.themes.Monochrome()) as demo:
     gr.Markdown("# Paraphrases the Text and Highlights the Non-melting Points")
@@ -485,13 +485,15 @@ with gr.Blocks(theme = gr.themes.Monochrome()) as demo:
         html_output = gr.HTML()
     with gr.Row():
-      submit_button.click(model, inputs=user_input, outputs=[ai_output, html_output])
-      clear_button.click(lambda: "", inputs=None, outputs=user_input)
-      clear_button.click(lambda: "", inputs=None, outputs=[ai_output, selected_sentence, html_output])
-# Launch the demo
-demo.launch()

     https://colab.research.google.com/drive/1pFGR4uvXMMWVJFQeFmn--arumSxqa5Yy
 """
 from transformers import AutoTokenizer
 from transformers import AutoModelForSeq2SeqLM
 import plotly.graph_objects as go
 import torch
 from transformers import GPT2LMHeadModel
 import seaborn as sns
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM
 # from colorama import Fore, Style
 # import openai
 import random
 import nltk
 from nltk.translate.bleu_score import sentence_bleu
 from transformers import BertTokenizer, BertModel
+import graphviz
+import gradio as gr
 nltk.download('stopwords')
 # Function to Initialize the Model
 question = "Following the declaration of the State of Israel in 1948, neighboring Arab states invaded. The war ended with Israel controlling a significant portion of the territory. Many Palestinians became refugees."
 import re
 from nltk.corpus import stopwords
     return common_grams
 def llm_output(prompt):
+    return prompt, prompt
 def highlight_phrases_with_colors(sentences, phrases):
+    color_map = {}
+    color_index = 0
     highlighted_html = []
     idx = 1
     for sentence in sentences:
         sentence_with_idx = f"{idx}. {sentence}"
         idx += 1
+        highlighted_sentence = sentence_with_idx
         phrase_count = 0
         words = re.findall(r'\b\w+\b', sentence)
+        word_index = 1
         for phrase in phrases:
             if phrase not in color_map:
                 color_map[phrase] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
                 color_index += 1
             escaped_phrase = re.escape(phrase)
             pattern = rf'\b{escaped_phrase}\b'
             highlighted_sentence, num_replacements = re.subn(
             )
             if num_replacements > 0:
                 phrase_count += 1
+                word_index += 1
         highlighted_html.append(highlighted_sentence)
     final_html = "<br><br>".join(highlighted_html)
     return f'''
+    <div style="border: solid 1px #; padding: 16px; background-color: #FFFFFF; color: #374151; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 2px;">
+    <h3 style="margin-top: 0; font-size: 1em; color: #111827;">Paraphrased And Highlighted Text</h3>
+    <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 2px;">{final_html}</div>
     </div>
     '''
+# Masking Model
+def mask_non_stopword(sentence):
+    stop_words = set(stopwords.words('english'))
+    words = sentence.split()
+    non_stop_words = [word for word in words if word.lower() not in stop_words]
+    if not non_stop_words:
+        return sentence
+    word_to_mask = random.choice(non_stop_words)
+    masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
+    return masked_sentence
+# Load tokenizer and model for masked language model
+tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
+def mask(sentence):
+    predictions = fill_mask(sentence)
+    masked_sentences = [predictions[i]['sequence'] for i in range(len(predictions))]
+    return masked_sentences
+# Function to generate the tree and return the Graphviz source
+def generate_tree(original_sentence: str) -> str:
+    paraphrased_sentences = generate_paraphrase(original_sentence)
+    first_paraphrased_sentence = paraphrased_sentences[0]
+    masked_sentence = mask_non_stopword(first_paraphrased_sentence)
+    masked_versions = mask(masked_sentence)
+    dot = graphviz.Digraph()
+    dot.attr(rankdir='LR', size='8,10!', dpi='72')
+    dot.node("Original", original_sentence)
+    dot.node("Paraphrased", first_paraphrased_sentence)
+    dot.edge("Original", "Paraphrased")
+    for i, masked in enumerate(masked_versions):
+        node_id = f"Masked_{i}"
+        dot.node(node_id, masked)
+        dot.edge("Paraphrased", node_id)
+    return masked_sentence, dot.source
+# Function for the Gradio interface
 def model(prompt):
+    generated, sentence = llm_output(prompt)
     res = generate_paraphrase(sentence)
+    common_subs = longest_common_subss(sentence, res)
+    common_grams = find_common_subsequences(sentence, res)
     for i in range(len(common_subs)):
         common_subs[i]["Paraphrased Sentence"] = res[i]
+    result = highlight_phrases_with_colors(res, common_grams)
+    masked_sentence, tree_source = generate_tree(sentence)
+    graph = graphviz.Source(tree_source)
+    svg_content = graph.pipe(format='svg').decode('utf-8')
+    # tree = f'<div style="width: 100%; overflow-x: auto;">{svg_content}</div>'
+    return generated, generated, result, masked_sentence, svg_content
 with gr.Blocks(theme = gr.themes.Monochrome()) as demo:
     gr.Markdown("# Paraphrases the Text and Highlights the Non-melting Points")
         html_output = gr.HTML()
     with gr.Row():
+        masked_sentence = gr.Textbox(label="Masked Sentence")
+    with gr.Row():
+        tree = gr.HTML(label="Tree")
+    submit_button.click(model, inputs=user_input, outputs=[ai_output, selected_sentence, html_output, masked_sentence, tree])
+    clear_button.click(lambda: "", inputs=None, outputs=user_input)
+    clear_button.click(lambda: "", inputs=None, outputs=[ai_output, selected_sentence, html_output, masked_sentence, tree])
+# Launch the demo
+demo.launch(share=True)