import os

import torch
import nltk
from nltk.tokenize import sent_tokenize
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import gradio as gr

nltk.download('punkt')

# Load the financial-summarization Pegasus model and its tokenizer
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)


def pegasus(text):
    '''Summarize a document chunk by chunk.
    Splits the input into chunks of whole sentences that fit the model's
    input limit, generates an abstractive summary for each chunk, and
    joins the results into a single summarized document.'''

    # Persist the raw input to disk so it can be inspected or reused later
    data_path = "/tmp/"
    os.makedirs(data_path, exist_ok=True)
    input_ = os.path.join(data_path, "input.txt")
    with open(input_, "w") as file:
        file.write(text)
    # Read the written txt back into a variable
    with open(input_, "r") as f:
        text_ = f.read()

    def tokenized_sentences(document):
        '''Split a document into chunks of whole sentences. Each chunk stays
        under 512 characters, a rough proxy for the Pegasus model's
        512-token input limit. Returns the chunks and the total sentence
        count.'''
        chunks = []
        sentences = []
        length = 0
        count = 0
        for sentence in sent_tokenize(document):
            count += 1
            length += len(sentence)
            if length < 512:
                sentences.append(sentence)
            else:
                # Current chunk is full; start a new one with this sentence
                chunks.append(sentences)
                sentences = [sentence]
                length = len(sentence)
        sentences = [sentence.strip() for sentence in sentences]
        # Append the final, partially filled chunk
        if sentences:
            chunks.append(sentences)
        return chunks, count

    tokenized, size = tokenized_sentences(text_)

    # Use GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Derive a generation length cap from the sentence count
    # (note that generate()'s max_length is measured in tokens)
    if size <= 4:
        max_length = size
    else:
        max_length = size // 4

    # Encode each chunk, generate an abstractive summary, then decode it
    summary = []
    for chunk in tokenized:
        inputs = tokenizer.encode(' '.join(chunk),
                                  truncation=True,
                                  return_tensors='pt').to(device)
        all_summary = model.generate(inputs,
                                     do_sample=True,
                                     max_length=max_length,
                                     top_k=50,
                                     top_p=0.95,
                                     num_beams=5,
                                     early_stopping=True)
        # Other generation knobs worth trying: num_return_sequences,
        # length_penalty, no_repeat_ngram_size, min_length
        output = [tokenizer.decode(each_summary,
                                   skip_special_tokens=True,
                                   clean_up_tokenization_spaces=False)
                  for each_summary in all_summary]
        summary.append(output)

    # Flatten the per-chunk summaries and join them into one document
    summary = [sentence for each in summary for sentence in each]
    final = " ".join(summary)
    return final


interface1 = gr.Interface(fn=pegasus,
                          inputs=gr.Textbox(lines=15,
                                            placeholder="Enter your text !!",
                                            label='Input-10k Sections'),
                          outputs=gr.Textbox(label='Output- Pegasus')).launch()
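
# A minimal usage sketch (the sample text below is hypothetical, chosen only
# to resemble a 10-K style passage): pegasus() can also be called directly,
# without the Gradio UI, e.g. from a notebook or another script.
#
#   sample = ("The company reported quarterly revenue of $4.2 billion, up 12% "
#             "year over year, driven by growth in its cloud segment. "
#             "Operating margin expanded to 28%, and management raised its "
#             "full-year guidance.")
#   print(pegasus(sample))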