# https://huggingface.co/spaces/micknikolic/pdf-abstract-summarizer

# Here are the imports
import pdfplumber
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from bert_score import score as bert_score
from io import BytesIO
from scipy.io.wavfile import write as write_wav
import gradio as gr
import numpy as np
from gtts import gTTS

# Here is the code

##Instantiating model and tokenizer.
# Fall back to CPU when no GPU is present; the original hard-coded "cuda" and
# crashed on CPU-only hosts.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

pegasus_research_model = AutoModelForSeq2SeqLM.from_pretrained(
    "UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv"
)
pegasus_research_model = pegasus_research_model.to(DEVICE)
pegasus_research_tokenizer = AutoTokenizer.from_pretrained(
    "UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv"
)


##Defining functions.
def extract_abstract(uploaded_file):
    """Extract the 'abstract' section from a PDF.

    Scans pages in order; on the first page whose text contains "abstract"
    (case-insensitive), returns the slice from that word up to the following
    "introduction" (or the end of that page's text if none is found).

    Args:
        uploaded_file: A path or file-like object accepted by pdfplumber.open.

    Returns:
        The abstract text, or "" when no abstract is found.
    """
    with pdfplumber.open(uploaded_file) as pdf:
        abstract = ""
        for page in pdf.pages:
            text = page.extract_text(x_tolerance=1, y_tolerance=1)
            if not text:
                continue
            text_lower = text.lower()
            if "abstract" in text_lower:
                start_index = text_lower.find("abstract")
                end_index = text_lower.find("introduction", start_index)
                if end_index == -1:
                    end_index = len(text)
                abstract = text[start_index:end_index]
                break
    return abstract


def text_chunker(text, tokenizer, max_tokens):
    """Split text into chunks of at most max_tokens tokens.

    Args:
        text: The input string.
        tokenizer: A Hugging Face tokenizer used to encode/decode.
        max_tokens: Maximum tokens per chunk.

    Returns:
        A list of decoded text chunks (special tokens stripped).
    """
    tokens = tokenizer.encode(text)
    # Ceiling division: one extra chunk for any remainder.
    num_chunks = len(tokens) // max_tokens + (len(tokens) % max_tokens > 0)
    chunked_tokens = [
        tokens[i * max_tokens : (i + 1) * max_tokens] for i in range(num_chunks)
    ]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunked_tokens]


def pegasus_research_summarize(text):
    """Summarize text with the Pegasus-X research-paper model.

    Args:
        text: The input string to summarize.

    Returns:
        The decoded summary string (40-150 tokens, beam search with 4 beams).
    """
    inputs = pegasus_research_tokenizer.encode(
        "summarize: " + text, return_tensors="pt", max_length=800, truncation=True
    )
    summary_ids = pegasus_research_model.generate(
        inputs.to(DEVICE),
        max_length=150,
        min_length=40,
        length_penalty=0.5,
        num_beams=4,
        early_stopping=True,
    )
    return pegasus_research_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def _similarity_label(f1):
    """Map a BERTScore F1 value to a human-readable similarity label.

    Shared by select_best_sentence and the final UI output so the two stay
    consistent (the original hard-coded "(Very High Similarity)" in the UI
    regardless of the actual score).
    """
    if f1 <= 0.20:
        return " (Very Low Similarity)"
    elif f1 <= 0.40:
        return " (Low Similarity)"
    elif f1 <= 0.60:
        return " (Moderate Similarity)"
    elif f1 <= 0.80:
        return " (High Similarity)"
    return " (Very High Similarity)"


def select_best_sentence(summary, reference_text):
    """Pick the summary sentence with the highest BERTScore F1 vs. the reference.

    Args:
        summary: Candidate summary text; split into sentences on '.'.
        reference_text: Text each sentence is scored against.

    Returns:
        (best_sentence, "score (Label)") — e.g. ("...", "0.75 (High Similarity)").
        Returns ("", "0.00 (Very Low Similarity)") when no sentences exist.
    """
    sentences = [s.strip() for s in summary.split(".") if s.strip()]
    if not sentences:
        return "", "0.00 (Very Low Similarity)"
    _, _, f1_scores = bert_score(
        sentences,
        [reference_text] * len(sentences),
        lang="en",
        rescale_with_baseline=True,
    )
    best_sentence_index = np.argmax(f1_scores)
    best_sentence = sentences[best_sentence_index]
    best_f1_score = round(f1_scores[best_sentence_index].item(), 2)
    return best_sentence, f"{best_f1_score}{_similarity_label(best_f1_score)}"


def convert_to_audio(text):
    """Synthesize English speech for text via gTTS.

    Args:
        text: The text to speak.

    Returns:
        MP3-encoded audio as raw bytes.
    """
    tts = gTTS(text, lang="en")
    buffer = BytesIO()
    tts.write_to_fp(buffer)
    buffer.seek(0)
    return buffer.read()


def pr_recursive_summarize(text, reference_text, recursion_l=0):
    """Recursively chunk-and-summarize text until it is short enough.

    Chunks the text into <=800-token pieces, summarizes each, and recurses on
    the concatenated summaries while they exceed 50 tokens (up to 10 levels).
    The final summary is split into sentences and the one with the highest
    BERTScore F1 against reference_text is returned.

    Args:
        text: Text to summarize.
        reference_text: Reference for BERTScore sentence selection.
        recursion_l: Current recursion depth (callers pass 0).

    Returns:
        (best_sentence, best_f1_score), or (None, 0.0) when the final summary
        contains no sentences.
    """
    recursion_level = recursion_l + 1
    print(f"Pegasus Research level: {recursion_level}\n")
    chunks = text_chunker(text, pegasus_research_tokenizer, max_tokens=800)
    print(f"Number of chunks: {len(chunks)}")
    summaries = []
    for i, chunk in enumerate(chunks, 1):
        print(f"Chunk no.{i}:")
        print(chunk, "\n")
        summary = pegasus_research_summarize(chunk)
        print("Summary:", summary)
        summaries.append(summary)
        print("_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_")
    # Free GPU memory between rounds; guarded so CPU-only hosts don't crash.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    concatenated_summary = " ".join(summaries)
    tokens = pegasus_research_tokenizer.tokenize(concatenated_summary)
    if len(tokens) > 50 and recursion_level <= 10:
        print("Recursive")
        return pr_recursive_summarize(concatenated_summary, reference_text, recursion_level)
    final_summary = concatenated_summary
    if len(chunks) > 1:
        # Multiple chunk summaries: compress them into one coherent summary.
        final_summary = pegasus_research_summarize(concatenated_summary)
    sentences = [s.strip() for s in final_summary.split(".") if s.strip()]
    if not sentences:
        return None, 0.0
    _, _, f1_scores = bert_score(sentences, [reference_text] * len(sentences), lang="en")
    best_sentence_index = np.argmax(f1_scores)
    best_sentence = sentences[best_sentence_index]
    best_f1_score = f1_scores[best_sentence_index].item()
    return best_sentence, best_f1_score


def summarize_and_convert_to_audio(pdf_file):
    """Gradio handler: summarize a PDF's abstract and speak the best sentence.

    Args:
        pdf_file: The uploaded PDF file.

    Returns:
        (audio_bytes_or_None, summary_sentence_or_message, "score (Label)") —
        ordered to match the (Audio, Textbox, Textbox) output components.
        (The original returned the error message in the Audio slot.)
    """
    abstract_text = extract_abstract(pdf_file)
    if not abstract_text:
        return (
            None,
            "No 'abstract' section found in the uploaded PDF. Please upload a different PDF.",
            "0.00 (Very Low Similarity)",
        )
    best_sentence, best_f1_score = pr_recursive_summarize(abstract_text, abstract_text)
    if not best_sentence:
        # Summarization produced no usable sentence; don't call gTTS on None.
        return None, "Could not produce a summary sentence from the abstract.", "0.00 (Very Low Similarity)"
    audio_bytes = convert_to_audio(best_sentence)
    # Label the score from its actual value instead of always "Very High".
    return audio_bytes, best_sentence, f"{best_f1_score:.2f}{_similarity_label(best_f1_score)}"


##Building the Gradio UI.
iface = gr.Interface(
    fn=summarize_and_convert_to_audio,
    inputs=gr.File(label="Upload PDF"),
    outputs=[
        gr.Audio(label="Audio"),
        gr.Textbox(label="Summary sentence"),
        gr.Textbox(label="Bert F1-Score"),
    ],
    title="PDF Abstract Summarizer and Audio Converter",
    description="Upload a PDF file to extract and summarize its 'abstract' section. The best summary sentence based on its Bert F1-score will be converted into speech and the score's interpretation will be displayed. (A PDF file needs to contain the Abstract section.)",
    examples=[
        "Hidden Technical Debt in Machine Learning Systems.pdf",
        "On the Cross-Entropy Method.pdf",
    ],
)

iface.launch()