"""Gradio app that condenses an article into bullet points with DistilBART."""

# Importing the necessary libraries
import math
import re

import gradio as gr
import nltk
import numpy as np
import torch
from gradio.mix import Parallel
from nltk.tokenize import sent_tokenize
from transformers import BartForConditionalGeneration, BartTokenizer, pipeline

# The Punkt sentence-tokenizer data is required by ``sent_tokenize``.
nltk.download('punkt')

# Maximum number of sentences grouped into one passage before summarizing.
SENTENCES_PER_BATCH = 10
# Passages with fewer tokens than this are copied to the output unchanged.
MIN_TOKENS_TO_SUMMARIZE = 150
# Passages with more tokens than this are split in two before summarizing;
# each half is additionally truncated to this length so that the model is
# never fed more tokens than this threshold.
MAX_MODEL_TOKENS = 1024


# Defining a function to read in the text file
def read_in_text(url):
    """Return the full contents of the text file located at path *url*."""
    with open(url, 'r') as file:
        article = file.read()
    return article


def clean_text(url):
    """Normalize raw article text before sentence splitting.

    Args:
        url: The article text itself (the parameter name is historical; the
            value is treated as a string of text, not as a location).

    Returns:
        The text with non-ASCII characters dropped, newlines and tabs turned
        into spaces, runs of spaces collapsed to one, and surrounding
        whitespace removed.
    """
    # Remove non-ASCII characters (e.g. Chinese characters).
    text = url.encode("ascii", errors="ignore").decode("ascii")
    # Newlines and tabs become spaces; a separate pass for "\n\n" is not
    # needed because every single "\n" is already replaced here.
    text = re.sub(r"[\n\t]", " ", text)
    # Collapse multiple spaces into one, then trim both ends.
    return re.sub(" +", " ", text).strip()


# Initializing the model and its tokenizer
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")


def _balanced_batches(items, max_size):
    """Yield consecutive slices of *items*, each at most *max_size* long.

    The batch size is recomputed from what is left on every step so the
    slices come out roughly equal in length instead of leaving a short tail.
    """
    start = 0
    remaining = len(items)
    while remaining > 0:
        batches_left = math.ceil(remaining / max_size)
        size = math.ceil(remaining / batches_left)
        yield items[start:start + size]
        start += size
        remaining -= size


def _summarize_ids(input_ids):
    """Generate a summary for already-tokenized input.

    Returns the decoded summary, or ``None`` when the generated sequence is
    not shorter (in tokens) than the input, in which case the caller skips it.
    """
    summary_ids = model.generate(input_ids)
    if len(summary_ids[0]) >= len(input_ids[0]):
        return None
    return tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


def _summarize_oversized(passage):
    """Summarize a passage that exceeds ``MAX_MODEL_TOKENS`` in two halves.

    The passage is split by sentence count into two parts and each part is
    summarized separately. A part can still be too long (for instance a
    single very long sentence), so it is truncated to ``MAX_MODEL_TOKENS``
    when tokenized rather than being passed to the model at full length.

    Returns:
        A list with the summaries that were kept (zero, one or two items).
    """
    sentences = sent_tokenize(passage)
    # max(1, ...) keeps the range step valid even if no sentence is found.
    half = max(1, math.ceil(len(sentences) / 2))

    summaries = []
    for start in range(0, len(sentences), half):
        part = ' '.join(sentences[start:start + half])
        part_ids = tokenizer(
            part,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_MODEL_TOKENS,
        )["input_ids"]
        summary = _summarize_ids(part_ids)
        if summary is not None:
            summaries.append(summary)
    return summaries


# Defining a function to get the summary of the article
def final_summary(file):
    """Summarize an article and return the result as a bulleted list.

    Args:
        file: The article text as a string (this is what the Gradio textbox
            passes in).

    Returns:
        A string with one ``"* "``-prefixed sentence per line. An empty
        string is returned when the input contains no sentences.

    The cleaned text is split into sentences and grouped into batches of up
    to ``SENTENCES_PER_BATCH`` sentences. Depending on its token count each
    batch is kept verbatim (short), summarized directly (medium), or split in
    two and summarized per half (long). Summaries that are not shorter than
    their input are dropped.
    """
    # Reading in the text and tokenizing it into sentences
    sentences = sent_tokenize(clean_text(file))

    pieces = []
    for batch in _balanced_batches(sentences, SENTENCES_PER_BATCH):
        passage = ' '.join(batch)
        # Tokenized without truncation on purpose: the real token count
        # decides which branch handles the passage.
        input_ids = tokenizer(passage, return_tensors="pt")["input_ids"]
        token_count = len(input_ids[0])

        if token_count < MIN_TOKENS_TO_SUMMARIZE:
            pieces.append(passage)
        elif token_count > MAX_MODEL_TOKENS:
            pieces.extend(_summarize_oversized(passage))
        else:
            summary = _summarize_ids(input_ids)
            if summary is not None:
                pieces.append(summary)

    # Joining all the summary output together, one bullet per sentence.
    # The replace() tidies the " ." that generation can leave before a period.
    bullets = [
        "* " + line.strip().replace(' .', '.')
        for line in sent_tokenize(' '.join(pieces))
    ]
    return "\n".join(bullets)


# Creating an interface for the article summarizer using gradio
demo = gr.Interface(
    final_summary,
    inputs=[gr.inputs.Textbox(label="Drop your article here", optional=False)],
    title="ARTICLE SUMMARIZER",
    outputs=[gr.outputs.Textbox(label="Summary")],
    theme="darkhuggingface",
)

# Launching the app
if __name__ == "__main__":
    demo.launch(debug=True)