import math
import re

import gradio as gr
import nltk
import spacy
from nltk.tokenize import sent_tokenize
from transformers import BartTokenizer, BartForConditionalGeneration

# sent_tokenize needs the Punkt sentence-tokenizer data.
nltk.download("punkt")

def read_in_text(file_path):
    """Read the raw article text from a local file."""
    with open(file_path, "r") as file:
        article = file.read()
    return article

def clean_text(raw_text):
    """Strip non-ASCII characters and collapse all whitespace to single spaces."""
    text = raw_text.encode("ascii", errors="ignore").decode("ascii")

    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\t", " ", text)
    text = re.sub(r" +", " ", text).strip()
    return text
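
# Rough usage sketch for the two helpers above ("article.txt" is a hypothetical
# file name, not something this script ships with):
#
#     article = clean_text(read_in_text("article.txt"))
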
# Load the DistilBART summarization model and its tokenizer, plus the small
# English spaCy pipeline used for sentence splitting.
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
nlp = spacy.load("en_core_web_sm")
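
# Minimal sketch of a single summarization call with this model/tokenizer pair,
# kept here as a comment (the input string is an arbitrary placeholder):
#
#     ids = tokenizer("Some long passage of text ...", return_tensors="pt")["input_ids"]
#     print(tokenizer.batch_decode(model.generate(ids), skip_special_tokens=True)[0])
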
def final_summary(file):
    """Summarize an article into a list of bullet points."""
    text = file
    bullet_points = 10

    # Keep re-summarizing until the result fits into fewer than 10 bullet points.
    while bullet_points >= 10:
        chunks = []
        sentences = nlp(text)
        for sentence in sentences.sents:
            chunks.append(str(sentence))

        output = []
        sentences_remaining = len(chunks)
        i = 0

        # Walk through the sentences in roughly equal-sized batches and summarize each batch.
        while sentences_remaining > 0:
            chunks_remaining = math.ceil(sentences_remaining / 10.0)
            next_chunk_size = math.ceil(sentences_remaining / chunks_remaining)
            sentence = " ".join(chunks[i:i + next_chunk_size])

            i += next_chunk_size
            sentences_remaining -= next_chunk_size

            inputs = tokenizer(sentence, return_tensors="pt", padding="longest")
            original_input_length = len(inputs["input_ids"][0])

            # Very short batches are kept as-is rather than summarized.
            if original_input_length < 100:
                split_sentences = nlp(sentence)
                for split_sentence in split_sentences.sents:
                    output.append(str(split_sentence).rstrip("."))

            # Batches longer than the model's 1024-token limit are split in half
            # and each half is summarized separately.
            elif original_input_length > 1024:
                sent = sent_tokenize(sentence)
                length_sent = len(sent)

                j = 0
                sent_remaining = math.ceil(length_sent / 2)

                while length_sent > 0:
                    halved_sentence = " ".join(sent[j:j + sent_remaining])
                    halved_inputs = tokenizer(halved_sentence, return_tensors="pt")
                    halved_summary_ids = model.generate(halved_inputs["input_ids"])
                    j += sent_remaining
                    length_sent -= sent_remaining

                    # Only keep the summary if it is actually shorter than its input.
                    if len(halved_summary_ids[0]) < len(halved_inputs["input_ids"][0]):
                        halved_summary = tokenizer.batch_decode(
                            halved_summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
                        )[0]
                        output.append(halved_summary)

            # Batches within the model's limit are summarized directly.
            else:
                summary_ids = model.generate(inputs["input_ids"])

                if len(summary_ids[0]) < original_input_length:
                    summary = tokenizer.batch_decode(
                        summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
                    )[0]
                    output.append(summary)

        # Split the generated summaries into individual sentences.
        final_output = []
        for paragraph in output:
            lines = paragraph.split(" . ")
            for line in lines:
                final_output.append(line.replace(" .", "").strip())
        text = ". ".join(final_output)
        bullet_points = len(final_output)

    # Format each sentence as a bullet point.
    for i in range(len(final_output)):
        final_output[i] = "* " + final_output[i] + "."

    summary_bullet = "\n".join(final_output)

    return summary_bullet
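
# Quick local check, independent of the Gradio UI (the string below is just a
# placeholder; in practice final_summary receives the textbox contents):
#
#     print(final_summary(clean_text("Paste a long article here ...")))
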
# Gradio UI: paste an article into the textbox and get a bulleted summary back.
# gr.Textbox is the current component API (gr.inputs / gr.outputs are deprecated).
demo = gr.Interface(
    fn=final_summary,
    inputs=gr.Textbox(label="Drop your article here"),
    outputs=gr.Textbox(label="Summary"),
    title="ARTICLE SUMMARIZER",
)

if __name__ == "__main__":
    demo.launch(debug=True)