Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
-
"""
|
3 |
|
4 |
Automatically generated by Colab.
|
5 |
|
6 |
Original file is located at
|
7 |
-
https://colab.research.google.com/drive/
|
8 |
"""
|
9 |
|
|
|
10 |
|
11 |
import gradio as gr
|
12 |
|
@@ -299,97 +300,14 @@ def generate_paraphrase(question):
|
|
299 |
res = paraphrase(question, para_tokenizer, para_model)
|
300 |
return res
|
301 |
|
302 |
-
# question = "The official position of the United States on the Russia Ukraine war has been consistent in supporting Ukraine ’s sovereignty , territorial integrity, and the peaceful resolution of the conflict."
|
303 |
-
|
304 |
question = "Following the declaration of the State of Israel in 1948, neighboring Arab states invaded. The war ended with Israel controlling a significant portion of the territory. Many Palestinians became refugees."
|
305 |
|
306 |
-
res = generate_paraphrase(question)
|
307 |
-
|
308 |
-
res
|
309 |
-
|
310 |
-
longest_common_subss(question, res)
|
311 |
-
|
312 |
import nltk
|
313 |
nltk.download('punkt')
|
314 |
-
|
315 |
import re
|
316 |
from nltk.corpus import stopwords
|
317 |
from nltk.tokenize import word_tokenize
|
318 |
|
319 |
-
def non_melting_points(original_sentence, paraphrased_sentences):
    """Return the content words shared by the original sentence and every paraphrase.

    Each sentence is lower-cased and tokenized with NLTK, then filtered to
    alphabetic, non-stopword tokens; the intersection of those token sets
    across the original and all paraphrases is returned as a set of words.
    """
    stop_words = set(stopwords.words('english'))

    def content_words(text):
        # Alphabetic, non-stopword tokens of the lower-cased text.
        tokens = word_tokenize(text.lower())
        return {tok for tok in tokens if tok.isalpha() and tok not in stop_words}

    shared = content_words(original_sentence)
    for para in paraphrased_sentences:
        shared &= content_words(para)
    return shared
|
332 |
-
|
333 |
-
#Function to get the first sentence from a paragraph
|
334 |
-
|
335 |
-
import re
|
336 |
-
|
337 |
-
def get_first_sentence(paragraph):
    """Return the first sentence of *paragraph*.

    A sentence boundary is taken to be a period followed by optional
    whitespace and a capital letter (evidence that another sentence
    follows).  When such a boundary exists, the text up to and including
    the first period is returned; otherwise the paragraph is returned
    unchanged.
    """
    boundary = re.search(r'([^.]*\.[\s]*[A-Z])', paragraph)
    if boundary is None:
        # No "period + capital" boundary: treat the whole paragraph as
        # a single sentence (covers trailing-period and no-period text).
        return paragraph
    candidate = boundary.group(0).strip()
    pieces = candidate.split('.')
    if len(pieces) > 1:
        return pieces[0] + '.'
    return candidate
|
347 |
-
|
348 |
-
|
349 |
-
#Initializing llama3
|
350 |
-
|
351 |
-
# import json
|
352 |
-
# import torch
|
353 |
-
# from transformers import (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline)
|
354 |
-
|
355 |
-
# config_data = json.load(open("config.json"))
|
356 |
-
# HF_TOKEN = config_data["HF_TOKEN"]
|
357 |
-
|
358 |
-
# model_name = "meta-llama/Meta-Llama-3-8B"
|
359 |
-
|
360 |
-
# bnb_config = BitsAndBytesConfig(
|
361 |
-
# load_in_4bit=True,
|
362 |
-
# bnb_4bit_use_double_quant=True,
|
363 |
-
# bnb_4bit_quant_type="nf4",
|
364 |
-
# bnb_4bit_compute_dtype=torch.bfloat16
|
365 |
-
# )
|
366 |
-
|
367 |
-
# tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
|
368 |
-
# tokenizer.pad_token = tokenizer.eos_token
|
369 |
-
|
370 |
-
# model = AutoModelForCausalLM.from_pretrained(
|
371 |
-
# model_name,
|
372 |
-
# device_map="auto",
|
373 |
-
# quantization_config=bnb_config,
|
374 |
-
# token=HF_TOKEN
|
375 |
-
# )
|
376 |
-
|
377 |
-
# text_generator = pipeline(
|
378 |
-
# "text-generation",
|
379 |
-
# model=model,
|
380 |
-
# tokenizer=tokenizer,
|
381 |
-
# max_new_tokens=512,
|
382 |
-
# )
|
383 |
-
|
384 |
-
# # llm_result = text_generator("write about nazism")
|
385 |
-
|
386 |
-
# llm_result
|
387 |
-
|
388 |
-
# llm_result[0]["generated_text"].split('.')
|
389 |
-
|
390 |
-
|
391 |
-
#Finds LCS
|
392 |
-
|
393 |
import re
|
394 |
from nltk.corpus import stopwords
|
395 |
|
@@ -467,8 +385,6 @@ common_grams
|
|
467 |
common_gram_words = [word for gram in common_grams for word in gram.split()]
|
468 |
common_gram_words
|
469 |
|
470 |
-
import re
|
471 |
-
|
472 |
def llm_output(prompt):
|
473 |
# sequences = text_generator(prompt)
|
474 |
# gen_text = sequences[0]["generated_text"]
|
@@ -478,45 +394,114 @@ def llm_output(prompt):
|
|
478 |
return prompt,prompt
|
479 |
|
480 |
import re
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
497 |
|
498 |
def model(prompt):
|
499 |
generated,sentence = llm_output(prompt)
|
500 |
res = generate_paraphrase(sentence)
|
501 |
common_subs = longest_common_subss(sentence,res)
|
502 |
-
|
503 |
common_grams = find_common_subsequences(sentence,res)
|
504 |
-
common_gram_words = [word for gram in common_grams for word in gram.split()]
|
505 |
for i in range(len(common_subs)):
|
506 |
common_subs[i]["Paraphrased Sentence"] = res[i]
|
507 |
-
result =
|
508 |
return generated, result
|
509 |
|
510 |
-
#
|
511 |
|
512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
513 |
|
514 |
-
demo = gr.Interface(
|
515 |
-
fn=model,
|
516 |
-
inputs=gr.Textbox(label="User Prompt"),
|
517 |
-
outputs=[gr.Textbox(label="AI-generated Text (Llama3)"), gr.HTML()],
|
518 |
-
title="Paraphrases the Text and Highlights the Non-melting Points",
|
519 |
-
theme=gr.themes.Soft()
|
520 |
-
)
|
521 |
|
522 |
-
demo.launch(share=True)
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
+
"""text-paraphraser.ipynb
|
3 |
|
4 |
Automatically generated by Colab.
|
5 |
|
6 |
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1pFGR4uvXMMWVJFQeFmn--arumSxqa5Yy
|
8 |
"""
|
9 |
|
10 |
+
!pip install gradio
|
11 |
|
12 |
import gradio as gr
|
13 |
|
|
|
300 |
res = paraphrase(question, para_tokenizer, para_model)
|
301 |
return res
|
302 |
|
|
|
|
|
303 |
question = "Following the declaration of the State of Israel in 1948, neighboring Arab states invaded. The war ended with Israel controlling a significant portion of the territory. Many Palestinians became refugees."
|
304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
import nltk
|
306 |
nltk.download('punkt')
|
|
|
307 |
import re
|
308 |
from nltk.corpus import stopwords
|
309 |
from nltk.tokenize import word_tokenize
|
310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
import re
|
312 |
from nltk.corpus import stopwords
|
313 |
|
|
|
385 |
common_gram_words = [word for gram in common_grams for word in gram.split()]
|
386 |
common_gram_words
|
387 |
|
|
|
|
|
388 |
def llm_output(prompt):
|
389 |
# sequences = text_generator(prompt)
|
390 |
# gen_text = sequences[0]["generated_text"]
|
|
|
394 |
return prompt,prompt
|
395 |
|
396 |
import re
|
397 |
+
import html
|
398 |
+
|
399 |
+
def highlight_phrases_with_colors(sentences, phrases):
    """Render *sentences* as a styled HTML card with each phrase highlighted.

    Every distinct phrase is assigned its own background colour (HSL hue
    stepped by 60 degrees) the first time it is seen, shared across all
    sentences.  Each occurrence is wrapped in a coloured <span> carrying a
    small black circular badge with a running number.  Matching is
    case-insensitive and anchored on word boundaries.  Returns a
    self-contained HTML string suitable for a gr.HTML() output.
    """
    color_map = {}   # phrase -> CSS colour; persists across sentences
    color_index = 0  # next hue step to hand out

    # Generate HTML for highlighting each sentence
    highlighted_html = []
    idx = 1  # 1-based sentence number prefixed to each sentence
    for sentence in sentences:
        sentence_with_idx = f"{idx}. {sentence}"
        idx += 1
        # Escape first so user text cannot inject markup; the spans we
        # insert below are added after escaping and stay intact.
        highlighted_sentence = html.escape(sentence_with_idx)
        phrase_count = 0  # number of phrases that matched in this sentence

        # NOTE(review): `words` is computed but never read — presumably a
        # leftover from an earlier per-word numbering scheme; confirm.
        words = re.findall(r'\b\w+\b', sentence)
        word_index = 1  # badge number shown next to a highlighted phrase

        # Highlight each phrase with a unique color and number
        for phrase in phrases:
            if phrase not in color_map:
                # First sighting of this phrase anywhere: assign next colour.
                color_map[phrase] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
                color_index += 1

            escaped_phrase = re.escape(phrase)
            pattern = rf'\b{escaped_phrase}\b'
            # Default arguments freeze the current colour/badge number at
            # lambda creation (avoids the late-binding closure pitfall).
            # NOTE(review): `count` is captured but never used in the body.
            highlighted_sentence, num_replacements = re.subn(
                pattern,
                lambda m, count=phrase_count, color=color_map[phrase], index=word_index: (
                    f'<span style="background-color: {color}; font-weight: bold;'
                    f' padding: 2px 4px; border-radius: 2px; position: relative;">'
                    f'<span style="background-color: black; color: white; border-radius: 50%;'
                    f' padding: 2px 5px; margin-right: 5px;">{index}</span>'
                    f'{m.group(0)}'
                    f'</span>'
                ),
                highlighted_sentence,
                flags=re.IGNORECASE
            )
            if num_replacements > 0:
                phrase_count += 1
                word_index += 1  # bumped once per phrase that matched, not per occurrence

        highlighted_html.append(highlighted_sentence)

    # Join sentences with a blank line between them
    final_html = "<br><br>".join(highlighted_html)

    # Wrap in a container div for styling
    return f'''
    <div style="border: solid 1px #; padding: 16px; background-color: #FFFFFF; color: #374151; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 12px;">
        <h3 style="margin-top: 0; font-size: 1.25em; color: #111827;">Paraphrased And Highlighted Text</h3>
        <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 12px;">{final_html}</div>
    </div>
    '''
|
454 |
|
455 |
def model(prompt):
    """End-to-end pipeline behind the Gradio UI.

    Feeds the prompt through llm_output, paraphrases the resulting
    sentence, computes the common subsequences/grams between the sentence
    and its paraphrases, and returns (generated text, highlighted HTML).
    """
    generated, sentence = llm_output(prompt)
    paraphrases = generate_paraphrase(sentence)
    common_subs = longest_common_subss(sentence, paraphrases)
    # non_melting = non_melting_points(sentence, paraphrases)
    common_grams = find_common_subsequences(sentence, paraphrases)
    # common_gram_words = [word for gram in common_grams for word in gram.split()]
    # Attach each paraphrase to its corresponding LCS record.
    for i, record in enumerate(common_subs):
        record["Paraphrased Sentence"] = paraphrases[i]
    highlighted = highlight_phrases_with_colors(paraphrases, common_grams)
    return generated, highlighted
|
466 |
|
467 |
+
# model(question)
|
468 |
|
469 |
+
# --- Gradio UI -----------------------------------------------------------
# Monochrome-themed Blocks layout: one prompt textbox in; the AI-generated
# text and the highlighted-paraphrase HTML out.
with gr.Blocks(theme = gr.themes.Monochrome()) as demo:
    gr.Markdown("# Paraphrases the Text and Highlights the Non-melting Points")

    with gr.Row():
        user_input = gr.Textbox(label="User Prompt")

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear")

    with gr.Row():
        ai_output = gr.Textbox(label="AI-generated Text (Llama3)")

    with gr.Row():
        # NOTE(review): this box is only ever cleared, never written to by
        # any handler below — confirm whether it is still needed.
        selected_sentence = gr.Textbox(label="Selected Sentence")

    with gr.Row():
        html_output = gr.HTML()

    with gr.Row():
        # Event wiring.  Submit runs the full model() pipeline; the two
        # Clear bindings blank the input and the output widgets.
        # NOTE(review): indentation reconstructed — the listeners may have
        # sat directly under the Blocks context in the original; behaviour
        # is the same either way.
        # NOTE(review): the second Clear handler returns a single "" for
        # three outputs — Gradio expects one value per output; verify.
        submit_button.click(model, inputs=user_input, outputs=[ai_output, html_output])
        clear_button.click(lambda: "", inputs=None, outputs=user_input)
        clear_button.click(lambda: "", inputs=None, outputs=[ai_output, selected_sentence, html_output])

# Launch the demo
demo.launch(share=True)

# !pip install pyngrok

# Alternative tunnelled launch via ngrok, kept for reference.
# NOTE(review): the original comment embedded a live ngrok auth token —
# redacted here; rotate the token and load it from config/env instead.
# from pyngrok import ngrok, conf
# conf.get_default().auth_token = '<REDACTED>'
# public_url = ngrok.connect(7861).public_url
# print(public_url)

# demo.queue().launch(server_port=7861, inline=False, share=False, debug=True)
# demo.launch(share=True,debug=True,inline = False)
|
506 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
507 |
|
|