# -*- coding: utf-8 -*-
"""Ai Re-Phraser.py

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/18bvmXQqMIkk7G0gY_1dUolI08RK6Ajrf
"""

# Importing the libraries
import warnings

warnings.filterwarnings("ignore")

import torch
from sentence_splitter import SentenceSplitter
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Parrot offers knobs to control the Adequacy, Fluency and Diversity metrics.
from parrot.filters import Adequacy, Fluency, Diversity

splitter = SentenceSplitter(language='en')

# Adding the metrics
adequacy_score = Adequacy()
fluency_score = Fluency()
diversity_score = Diversity()

# One device string for both the model and the Parrot filters
# (the original hard-coded "cuda:0" for the filters, which breaks on CPU-only machines).
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

adequacy_threshold = 0.99         # Adequacy (Does the paraphrase preserve the meaning?)
fluency_threshold = 0.90          # Fluency (Is the paraphrase fluent English?)
diversity_ranker = "levenshtein"  # Diversity (Lexical / Phrasal / Syntactical): how much the paraphrase changed the original sentence

# Adding the model
model_name = 'tuner007/pegasus_paraphrase'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model_pegasus = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)


def get_max_str(lst):
    """Return the longest string in the list."""
    return max(lst, key=len)


def get_response(input_text, num_return_sequences=10, num_beams=10):
    """Generate paraphrase candidates for one sentence and return the longest
    candidate that survives the adequacy, fluency and diversity filters."""
    # prepare_seq2seq_batch is deprecated; calling the tokenizer directly is equivalent.
    batch = tokenizer([input_text], truncation=True, padding='longest',
                      max_length=90, return_tensors='pt').to(torch_device)
    # Beam search is deterministic, so the original temperature=1.5 had no effect
    # and is dropped. For more varied candidates, diverse beam search
    # (num_beam_groups=num_beams, diversity_penalty=0.5) is an option.
    translated = model_pegasus.generate(**batch, max_length=90,
                                        num_beams=num_beams,
                                        num_return_sequences=num_return_sequences)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    try:
        adequacy_filtered_phrases = adequacy_score.filter(
            input_text, tgt_text, adequacy_threshold, torch_device)
        if len(adequacy_filtered_phrases) > 0:
            fluency_filtered_phrases = fluency_score.filter(
                adequacy_filtered_phrases, fluency_threshold, torch_device)
            if len(fluency_filtered_phrases) > 0:
                diversity_scored_phrases = diversity_score.rank(
                    input_text, fluency_filtered_phrases, diversity_ranker)
                return get_max_str(diversity_scored_phrases)
            # Nothing passed fluency: fall back to the adequacy-filtered set
            # (the original returned the empty fluency list here, which always raised).
            return get_max_str(adequacy_filtered_phrases)
    except Exception:
        pass
    # Nothing passed adequacy, or a filter failed: fall back to the raw
    # candidates, as the original bare except effectively did.
    return get_max_str(tgt_text)


# Deploying the model
import gradio as gr


def get_fun(txt):
    """Split the input into sentences, paraphrase each one, and rejoin."""
    sentences = splitter.split(text=txt)
    paraphrased = [get_response(sentence, num_return_sequences=30, num_beams=30)
                   for sentence in sentences]
    return ' '.join(paraphrased)


iface = gr.Interface(fn=get_fun, inputs="text", outputs="text",
                     title="Ai Re-Phraser Q'Hackday")
iface.launch(inline=False)
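# An optional variant, not in the original notebook: the commented-out
# num_beam_groups/diversity_penalty hint in get_response points at diverse
# beam search. The sketch below shows how that variant could look, reusing
# the hinted values (num_beam_groups=num_beams, diversity_penalty=0.5);
# the name get_response_diverse is new and purely illustrative.
def get_response_diverse(input_text, num_return_sequences=10, num_beams=10):
    """Generate paraphrase candidates with diverse beam search, which
    penalizes beam groups for repeating each other's tokens."""
    batch = tokenizer([input_text], truncation=True, padding='longest',
                      max_length=90, return_tensors='pt').to(torch_device)
    translated = model_pegasus.generate(**batch, max_length=90,
                                        num_beams=num_beams,
                                        num_return_sequences=num_return_sequences,
                                        num_beam_groups=num_beams,  # one beam per group => maximum diversity
                                        diversity_penalty=0.5)      # penalty for repeating tokens across groups
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

# Quick smoke test (assumes the models above loaded successfully):
#   print(get_fun("The weather was perfect, so we decided to walk."))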