# Source: HuggingFace Space micknikolic/pdf-abstract-summarizer, app.py (revision a1dac91)
# https://huggingface.co/spaces/micknikolic/pdf-abstract-summarizer
# Here are the imports
import pdfplumber
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from bert_score import score as bert_score
from io import BytesIO
from scipy.io.wavfile import write as write_wav
import gradio as gr
import numpy as np
from gtts import gTTS
# Here is the code
##Instantiating model and tokenizer.
# Pegasus-X fine-tuned for research-paper (arXiv) summarization.
pegasus_research_model = AutoModelForSeq2SeqLM.from_pretrained(
    "UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv"
)
# Fall back to CPU when no CUDA device exists — the original unconditional
# .to("cuda") crashes on CPU-only hosts.
pegasus_research_model = pegasus_research_model.to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
pegasus_research_tokenizer = AutoTokenizer.from_pretrained(
    "UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv"
)
##Defining functions.
def extract_abstract(uploaded_file):
    """Return the text between 'abstract' and 'introduction' in a PDF.

    Pages are scanned in order; the first page whose extracted text
    contains the word "abstract" (case-insensitive) supplies the result.
    Returns "" when no such page exists.
    """
    with pdfplumber.open(uploaded_file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text(x_tolerance=1, y_tolerance=1)
            if not page_text:
                continue
            lowered = page_text.lower()
            start = lowered.find("abstract")
            if start == -1:
                continue
            # Cut at the following "introduction" heading if present,
            # otherwise take everything to the end of the page.
            end = lowered.find("introduction", start)
            if end == -1:
                end = len(page_text)
            return page_text[start:end]
    return ""
def text_chunker(text, tokenizer, max_tokens):
    """Split *text* into pieces of at most *max_tokens* tokens each.

    Args:
        text: The string to split.
        tokenizer: Any object exposing ``encode(str) -> list`` and
            ``decode(list, skip_special_tokens=...) -> str``.
        max_tokens: Maximum token count per chunk (must be > 0).

    Returns:
        List of decoded text chunks; empty when *text* encodes to no tokens.
    """
    tokens = tokenizer.encode(text)
    # Slice with a stride instead of computing the chunk count by hand;
    # range() naturally yields zero chunks for an empty token list.
    return [
        tokenizer.decode(tokens[i : i + max_tokens], skip_special_tokens=True)
        for i in range(0, len(tokens), max_tokens)
    ]
def pegasus_research_summarize(text):
    """Summarize *text* with the Pegasus research model.

    Returns a single summary string (40-150 tokens, 4-beam search).
    """
    # NOTE(review): the "summarize: " prefix is a T5 convention and is not
    # required by Pegasus; kept to preserve existing behavior.
    inputs = pegasus_research_tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=800,
        truncation=True,
    )
    # Move inputs to wherever the model actually lives instead of a
    # hard-coded "cuda", so CPU-only hosts work too.
    inputs = inputs.to(pegasus_research_model.device)
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        summary_ids = pegasus_research_model.generate(
            inputs,
            max_length=150,
            min_length=40,
            length_penalty=0.5,
            num_beams=4,
            early_stopping=True,
        )
    return pegasus_research_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
def select_best_sentence(summary, reference_text):
    """Pick the summary sentence most similar to *reference_text*.

    Similarity is BERTScore F1 (rescaled with baseline). Returns the best
    sentence plus a label string like "0.85 (Very High Similarity)";
    returns ("", "0.00 (Very Low Similarity)") when no sentence exists.
    """
    candidates = [part.strip() for part in summary.split('.') if part.strip()]
    if not candidates:
        return "", "0.00 (Very Low Similarity)"
    _, _, f1 = bert_score(
        candidates,
        [reference_text] * len(candidates),
        lang="en",
        rescale_with_baseline=True,
    )
    winner = int(np.argmax(f1))
    best_score = round(f1[winner].item(), 2)
    # Map the score onto a human-readable similarity band.
    bands = (
        (0.20, " (Very Low Similarity)"),
        (0.40, " (Low Similarity)"),
        (0.60, " (Moderate Similarity)"),
        (0.80, " (High Similarity)"),
    )
    label = " (Very High Similarity)"
    for upper_bound, band in bands:
        if best_score <= upper_bound:
            label = band
            break
    return candidates[winner], f"{best_score}{label}"
def convert_to_audio(text):
    """Render *text* to MP3 bytes via Google text-to-speech (English)."""
    speech = gTTS(text, lang='en')
    mp3_buffer = BytesIO()
    speech.write_to_fp(mp3_buffer)
    # getvalue() returns the whole buffer without needing seek()/read().
    return mp3_buffer.getvalue()
def pr_recursive_summarize(text, reference_text, recursion_l=0):
    """Recursively summarize *text* until it is short, then pick the
    sentence closest to *reference_text* by BERTScore F1.

    Args:
        text: Text to summarize at this recursion level.
        reference_text: Text the final sentence is scored against.
        recursion_l: Recursion depth of the caller (starts at 0).

    Returns:
        (best_sentence, best_f1_score), or (None, 0.0) when the final
        summary contains no sentences.
    """
    recursion_level = recursion_l + 1
    print(f"Pegasus Research level: {recursion_level}\n")
    # (Removed dead locals: the original tokenized `text` to compute
    # expectedCountOfChunks/max_length, which were never used.)
    chunks = text_chunker(text, pegasus_research_tokenizer, max_tokens=800)
    print(f"Number of chunks: {len(chunks)}")
    summaries = []
    for i, chunk in enumerate(chunks, 1):
        print(f"Chunk no.{i}:")
        print(chunk, "\n")
        summary = pegasus_research_summarize(chunk)
        print("Summary:", summary)
        summaries.append(summary)
        print("_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_")
        # Release cached GPU memory between chunks; guarded so CPU-only
        # hosts never touch the CUDA runtime.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    concatenated_summary = ' '.join(summaries)
    tokens = pegasus_research_tokenizer.tokenize(concatenated_summary)
    if len(tokens) > 50 and recursion_level <= 10:
        # Still too long: summarize the concatenated summaries again
        # (depth-capped at 10 to guarantee termination).
        print("Recursive")
        return pr_recursive_summarize(concatenated_summary, reference_text, recursion_level)
    final_summary = concatenated_summary
    if len(chunks) > 1:
        # Multiple chunk summaries were merged; one more pass for coherence.
        final_summary = pegasus_research_summarize(concatenated_summary)
    sentences = [s.strip() for s in final_summary.split(".") if s.strip()]
    if not sentences:
        return None, 0.0
    # NOTE(review): no rescale_with_baseline here, unlike
    # select_best_sentence — kept as-is to preserve existing scores.
    _, _, f1_scores = bert_score(sentences, [reference_text] * len(sentences), lang="en")
    best_sentence_index = np.argmax(f1_scores)
    return sentences[best_sentence_index], f1_scores[best_sentence_index].item()
def summarize_and_convert_to_audio(pdf_file):
    """Gradio handler: summarize the PDF abstract and voice the result.

    Returns (audio_bytes, summary_sentence, score_text), matching the
    three UI outputs (Audio, Textbox, Textbox).
    """
    abstract_text = extract_abstract(pdf_file)
    if not abstract_text:
        # Fix: route the message to the summary textbox (second output);
        # the original placed it in the Audio slot, which gradio cannot play.
        return None, "No 'abstract' section found in the uploaded PDF. Please upload a different PDF.", "0.00 (Very Low Similarity)"
    best_sentence, best_f1_score = pr_recursive_summarize(abstract_text, abstract_text)
    if not best_sentence:
        # pr_recursive_summarize can return (None, 0.0); don't pass None to TTS.
        return None, "Could not produce a summary sentence.", "0.00 (Very Low Similarity)"
    audio_bytes = convert_to_audio(best_sentence)
    # Fix: derive the label from the actual score instead of always
    # reporting "(Very High Similarity)" (thresholds match select_best_sentence).
    if best_f1_score <= 0.20:
        label = "Very Low Similarity"
    elif best_f1_score <= 0.40:
        label = "Low Similarity"
    elif best_f1_score <= 0.60:
        label = "Moderate Similarity"
    elif best_f1_score <= 0.80:
        label = "High Similarity"
    else:
        label = "Very High Similarity"
    return audio_bytes, best_sentence, f"{best_f1_score:.2f} ({label})"
##Building the Gradio UI.
# Three outputs: the spoken summary, the sentence itself, and its score.
iface = gr.Interface(
    fn=summarize_and_convert_to_audio,
    inputs=gr.File(label="Upload PDF"),
    outputs=[
        gr.Audio(label="Audio"),
        gr.Textbox(label="Summary sentence"),
        gr.Textbox(label="Bert F1-Score"),
    ],
    title="PDF Abstract Summarizer and Audio Converter",
    description=(
        "Upload a PDF file to extract and summarize its 'abstract' section. "
        "The best summary sentence based on its Bert F1-score will be "
        "converted into speech and the score's interpretation will be "
        "displayed. (A PDF file needs to contain the Abstract section.)"
    ),
    examples=[
        "Hidden Technical Debt in Machine Learning Systems.pdf",
        "On the Cross-Entropy Method.pdf",
    ],
)
iface.launch()