# https://huggingface.co/spaces/micknikolic/pdf-abstract-summarizer
# Here are the imports
import pdfplumber
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from bert_score import score as bert_score
from io import BytesIO
from scipy.io.wavfile import write as write_wav
import gradio as gr
import numpy as np
from gtts import gTTS
# Here is the code
##Instantiating model and tokenizer.
pegasus_research_model = AutoModelForSeq2SeqLM.from_pretrained("UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv")
pegasus_research_model = pegasus_research_model.to("cuda")
pegasus_research_tokenizer = AutoTokenizer.from_pretrained("UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv")
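# Note: the Space assumes a CUDA GPU is available. A minimal sketch of a device
# fallback for CPU-only runs (an assumption, not part of the original code):
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   pegasus_research_model = pegasus_research_model.to(device)
# and pass `inputs.to(device)` to generate() below instead of `inputs.to("cuda")`.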
##Defining functions.
def extract_abstract(uploaded_file):
    # Scan pages until one contains "abstract"; keep the text from there up to
    # "introduction" (or to the end of that page) as the abstract.
    with pdfplumber.open(uploaded_file) as pdf:
        abstract = ""
        for page in pdf.pages:
            text = page.extract_text(x_tolerance=1, y_tolerance=1)
            if text:
                text_lower = text.lower()
                if "abstract" in text_lower:
                    start_index = text_lower.find("abstract")
                    end_index = text_lower.find("introduction", start_index)
                    if end_index == -1:
                        end_index = len(text)
                    abstract = text[start_index:end_index]
                    break
    return abstract
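# Usage sketch: pdfplumber.open() accepts both file paths and file-like objects,
# so this also works on a local file (hypothetical path, for illustration only):
#   abstract = extract_abstract("paper.pdf")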
def text_chunker(text, tokenizer, max_tokens):
    tokens = tokenizer.encode(text)
    num_chunks = len(tokens) // max_tokens + (len(tokens) % max_tokens > 0)
    chunked_tokens = [
        tokens[i * max_tokens : (i + 1) * max_tokens] for i in range(num_chunks)
    ]
    chunked_text = [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunked_tokens]
    return chunked_text
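# Usage sketch (hypothetical `long_text`): split an over-long input into
# <= max_tokens pieces and summarize each piece separately, as done in
# pr_recursive_summarize() below.
#   chunks = text_chunker(long_text, pegasus_research_tokenizer, max_tokens=800)
#   partial_summaries = [pegasus_research_summarize(c) for c in chunks]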
def pegasus_research_summarize(text):
    inputs = pegasus_research_tokenizer.encode("summarize: " + text,
                                               return_tensors="pt",
                                               max_length=800,
                                               truncation=True)
    summary_ids = pegasus_research_model.generate(inputs.to("cuda"),
                                                  max_length=150,
                                                  min_length=40,
                                                  length_penalty=0.5,
                                                  num_beams=4,
                                                  early_stopping=True)
    summary = pegasus_research_tokenizer.decode(summary_ids[0],
                                                skip_special_tokens=True)
    return summary
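# Generation uses beam search (num_beams=4) capped at 150 tokens; a length
# penalty below 1 nudges the model toward shorter outputs. A hedged example
# call (hypothetical `chunk_text`):
#   short_summary = pegasus_research_summarize(chunk_text)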
def select_best_sentence(summary, reference_text):
    sentences = summary.split('.')
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    if not sentences:
        return "", "0.00 (Very Low Similarity)"
    _, _, f1_scores = bert_score(sentences, [reference_text] * len(sentences), lang="en", rescale_with_baseline=True)
    best_sentence_index = np.argmax(f1_scores)
    best_sentence = sentences[best_sentence_index]
    best_f1_score = round(f1_scores[best_sentence_index].item(), 2)
    if best_f1_score <= 0.20:
        score_label = " (Very Low Similarity)"
    elif best_f1_score <= 0.40:
        score_label = " (Low Similarity)"
    elif best_f1_score <= 0.60:
        score_label = " (Moderate Similarity)"
    elif best_f1_score <= 0.80:
        score_label = " (High Similarity)"
    else:
        score_label = " (Very High Similarity)"
    best_f1_score_with_label = f"{best_f1_score}{score_label}"
    return best_sentence, best_f1_score_with_label
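# Usage sketch (hypothetical variables): returns the best sentence plus a
# labelled, baseline-rescaled F1, e.g. "0.47 (Moderate Similarity)".
#   sentence, labelled_f1 = select_best_sentence(candidate_summary, abstract_text)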
def convert_to_audio(text):
    tts = gTTS(text, lang='en')
    buffer = BytesIO()
    tts.write_to_fp(buffer)
    buffer.seek(0)
    audio_bytes = buffer.read()
    return audio_bytes
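# Local debugging sketch (hypothetical filename, not part of the Space): the MP3
# bytes returned by gTTS can also be written straight to disk.
#   with open("summary.mp3", "wb") as f:
#       f.write(convert_to_audio("Hello from the abstract summarizer."))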
def pr_recursive_summarize(text, reference_text, recursion_l=0):
    # Summarize the text chunk by chunk, then recurse on the concatenated
    # summaries until they are short enough (or the recursion limit is hit);
    # finally return the sentence closest to the reference by BERTScore F1.
    recursion_level = recursion_l + 1
    print(f"Pegasus Research level: {recursion_level}\n")
    chunks = text_chunker(text, pegasus_research_tokenizer, max_tokens=800)
    print(f"Number of chunks: {len(chunks)}")
    summaries = []
    for i, chunk in enumerate(chunks, 1):
        print(f"Chunk no.{i}:")
        print(chunk, "\n")
        summary = pegasus_research_summarize(chunk)
        print("Summary:", summary)
        summaries.append(summary)
        print("_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_")
    torch.cuda.empty_cache()
    concatenated_summary = ' '.join(summaries)
    tokens = pegasus_research_tokenizer.tokenize(concatenated_summary)
    if len(tokens) > 50 and recursion_level <= 10:
        print("Recursive")
        return pr_recursive_summarize(concatenated_summary, reference_text, recursion_level)
    else:
        final_summary = concatenated_summary
        if len(chunks) > 1:
            final_summary = pegasus_research_summarize(concatenated_summary)
        sentences = final_summary.split(".")
        sentences = [s.strip() for s in sentences if s.strip()]
        if not sentences:
            return None, 0.0
        p, r, f1_scores = bert_score(sentences, [reference_text] * len(sentences), lang="en")
        best_sentence_index = np.argmax(f1_scores)
        best_sentence = sentences[best_sentence_index]
        best_f1_score = f1_scores[best_sentence_index].item()
        return best_sentence, best_f1_score
def summarize_and_convert_to_audio(pdf_file):
    abstract_text = extract_abstract(pdf_file)
    if not abstract_text:
        # Output order must match the Gradio outputs below: audio, summary sentence, score.
        return None, "No 'abstract' section found in the uploaded PDF. Please upload a different PDF.", "0.00 (Very Low Similarity)"
    best_sentence, best_f1_score = pr_recursive_summarize(abstract_text, abstract_text)
    audio_bytes = convert_to_audio(best_sentence)
    # Label the raw F1 with the same thresholds used in select_best_sentence.
    thresholds = [(0.20, "Very Low"), (0.40, "Low"), (0.60, "Moderate"), (0.80, "High"), (float("inf"), "Very High")]
    label = next(name for limit, name in thresholds if best_f1_score <= limit)
    return audio_bytes, best_sentence, f"{best_f1_score:.2f} ({label} Similarity)"
##Building the Gradio UI.
iface = gr.Interface(
    fn=summarize_and_convert_to_audio,
    inputs=gr.File(label="Upload PDF"),
    outputs=[
        gr.Audio(label="Audio"),
        gr.Textbox(label="Summary sentence"),
        gr.Textbox(label="BERTScore F1")
    ],
    title="PDF Abstract Summarizer and Audio Converter",
    description="Upload a PDF file to extract and summarize its 'abstract' section. The best summary sentence, selected by its BERTScore F1 against the abstract, is converted to speech, and the score with its interpretation is displayed. (The PDF must contain an Abstract section.)"
)
iface.launch()
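# Local smoke test (assumption: a "paper.pdf" with an Abstract section sits next
# to this script; not part of the original Space). Run it instead of launch():
#   audio, sentence, score = summarize_and_convert_to_audio("paper.pdf")
#   print(sentence, score)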