Spaces:
Runtime error
Runtime error
| # https://huggingface.co/spaces/micknikolic/pdf-abstract-summarizer | |
| # Here are the imports | |
| import pdfplumber | |
| import torch | |
| from transformers import AutoModelForSeq2SeqLM, AutoTokenizer | |
| from bert_score import score as bert_score | |
| from io import BytesIO | |
| from scipy.io.wavfile import write as write_wav | |
| import gradio as gr | |
| import numpy as np | |
| from gtts import gTTS | |
| # Here is the code | |
| ##Instantiating model and tokenizer. | |
# Checkpoint shared by the model weights and the tokenizer.
_PEGASUS_CHECKPOINT = "UNIST-Eunchan/Research-Paper-Summarization-Pegasus-x-ArXiv"

# Use the GPU when one is present, otherwise stay on the CPU. An unconditional
# .to("cuda") raises while the module is being imported on a CPU-only host.
_PEGASUS_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

pegasus_research_model = AutoModelForSeq2SeqLM.from_pretrained(_PEGASUS_CHECKPOINT)
pegasus_research_model = pegasus_research_model.to(_PEGASUS_DEVICE)
pegasus_research_tokenizer = AutoTokenizer.from_pretrained(_PEGASUS_CHECKPOINT)
| ##Defining functions. | |
def extract_abstract(uploaded_file):
    """Return the abstract text found in a PDF, or "" when there is none.

    Pages are scanned in order. On the first page whose extracted text
    contains the word "abstract" (case-insensitive), the slice running from
    that word up to the next occurrence of "introduction" is returned; when
    no "introduction" follows on that page, the slice runs to the end of the
    page text. Pages after the first match are not inspected.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` is given by the caller.

    Returns:
        The matched slice of the page text, or an empty string.
    """
    with pdfplumber.open(uploaded_file) as document:
        for page in document.pages:
            page_text = page.extract_text(x_tolerance=1, y_tolerance=1)
            if not page_text:
                continue

            # Offsets are located in the lowercased copy and then applied to
            # the original text so the returned slice keeps its casing.
            lowered = page_text.lower()
            begin = lowered.find("abstract")
            if begin == -1:
                continue

            finish = lowered.find("introduction", begin)
            if finish == -1:
                finish = len(page_text)
            return page_text[begin:finish]
    return ""
def text_chunker(text, tokenizer, max_tokens):
    """Split *text* into consecutive pieces of at most *max_tokens* tokens.

    The text is encoded once with ``tokenizer.encode``, the resulting token
    sequence is cut into back-to-back windows of ``max_tokens`` items (the
    last window may be shorter), and each window is turned back into a string
    with ``tokenizer.decode(..., skip_special_tokens=True)``.

    Args:
        text: The string to split.
        tokenizer: Object providing ``encode(text)`` and
            ``decode(tokens, skip_special_tokens=...)``.
        max_tokens: Window size, in tokens.

    Returns:
        A list of decoded strings, one per window; empty when the encoded
        text has no tokens.
    """
    token_ids = tokenizer.encode(text)

    # Ceiling division: one extra window when a partial window is left over.
    whole, leftover = divmod(len(token_ids), max_tokens)
    piece_count = whole + (1 if leftover > 0 else 0)

    pieces = []
    for index in range(piece_count):
        window = token_ids[index * max_tokens:(index + 1) * max_tokens]
        pieces.append(tokenizer.decode(window, skip_special_tokens=True))
    return pieces
def pegasus_research_summarize(text):
    """Summarize *text* with the module-level Pegasus model.

    The input is prefixed with "summarize: ", encoded with truncation at 800
    tokens, and passed to ``generate`` using 4-beam search with a summary
    length between 40 and 150 tokens.

    Args:
        text: The passage to summarize.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    input_ids = pegasus_research_tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=800,
        truncation=True,
    )

    # Send the inputs to whichever device the model actually lives on rather
    # than a hard-coded "cuda", so the call also works on CPU-only hosts.
    input_ids = input_ids.to(pegasus_research_model.device)

    summary_ids = pegasus_research_model.generate(
        input_ids,
        max_length=150,
        min_length=40,
        length_penalty=0.5,
        num_beams=4,
        early_stopping=True,
    )
    return pegasus_research_tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
    )
def select_best_sentence(summary, reference_text):
    """Pick the summary sentence that scores highest against the reference.

    *summary* is split on '.', blank fragments are dropped, and every
    remaining sentence is scored against *reference_text* with BERTScore
    (English, rescaled with baseline). The sentence with the highest F1 wins.

    Args:
        summary: Text to split into candidate sentences.
        reference_text: Text each candidate is compared with.

    Returns:
        A ``(sentence, label)`` tuple where *label* is the F1 score rounded
        to two decimals followed by a similarity band in parentheses. When
        the summary holds no sentences the result is
        ``("", "0.00 (Very Low Similarity)")``.
    """
    fragments = [piece.strip() for piece in summary.split('.')]
    candidates = [piece for piece in fragments if piece]
    if not candidates:
        return "", "0.00 (Very Low Similarity)"

    references = [reference_text] * len(candidates)
    _, _, f1_scores = bert_score(candidates, references, lang="en", rescale_with_baseline=True)

    winner = np.argmax(f1_scores)
    top_f1 = round(f1_scores[winner].item(), 2)

    # Inclusive upper bounds of each band, checked from lowest to highest;
    # anything above the last bound falls into the top band.
    bands = (
        (0.20, " (Very Low Similarity)"),
        (0.40, " (Low Similarity)"),
        (0.60, " (Moderate Similarity)"),
        (0.80, " (High Similarity)"),
    )
    band_label = " (Very High Similarity)"
    for ceiling, name in bands:
        if top_f1 <= ceiling:
            band_label = name
            break

    return candidates[winner], f"{top_f1}{band_label}"
def convert_to_audio(text):
    """Synthesize English speech for *text* and return it as raw bytes.

    The audio is produced by gTTS and written into an in-memory buffer; the
    complete buffer contents are returned.

    Args:
        text: The text to speak.

    Returns:
        The bytes gTTS wrote (presumably MP3 data - confirm against gTTS).
    """
    speech = gTTS(text, lang='en')
    with BytesIO() as sink:
        speech.write_to_fp(sink)
        # getvalue() yields the whole buffer regardless of the cursor position.
        return sink.getvalue()
def pr_recursive_summarize(text, reference_text, recursion_l=0):
    """Summarize *text* chunk by chunk, recursing until the result is short.

    The text is split into chunks of at most 800 tokens, each chunk is
    summarized, and the summaries are joined with spaces. While the joined
    summary is longer than 50 tokens and fewer than 11 levels have run, the
    function calls itself on the joined summary. Otherwise the joined summary
    (re-summarized once more when it came from several chunks) is split into
    sentences and the sentence with the highest BERTScore F1 against
    *reference_text* is selected.

    Progress is printed to stdout at every level.

    Args:
        text: The text to summarize at this level.
        reference_text: The text the final sentences are scored against.
        recursion_l: Number of levels already performed by the caller.

    Returns:
        ``(best_sentence, best_f1_score)`` where the score is a float, or
        ``(None, 0.0)`` when the final summary contains no sentences.
    """
    recursion_level = recursion_l + 1
    print(f"Pegasus Research level: {recursion_level}\n")

    chunks = text_chunker(text, pegasus_research_tokenizer, max_tokens=800)
    print(f"Number of chunks: {len(chunks)}")

    summaries = []
    for i, chunk in enumerate(chunks, 1):
        print(f"Chunk no.{i}:")
        print(chunk, "\n")
        summary = pegasus_research_summarize(chunk)
        print("Summary:", summary)
        summaries.append(summary)
        print("_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_")
        # Release cached GPU memory between chunks.
        torch.cuda.empty_cache()

    concatenated_summary = ' '.join(summaries)
    tokens = pegasus_research_tokenizer.tokenize(concatenated_summary)

    # Keep condensing while the summary is still long; the level cap bounds
    # the recursion depth.
    if len(tokens) > 50 and recursion_level <= 10:
        print("Recursive")
        return pr_recursive_summarize(concatenated_summary, reference_text, recursion_level)

    final_summary = concatenated_summary
    if len(chunks) > 1:
        # Several chunk summaries were stitched together: fuse them once more.
        final_summary = pegasus_research_summarize(concatenated_summary)

    sentences = [s.strip() for s in final_summary.split(".") if s.strip()]
    if not sentences:
        return None, 0.0

    _, _, f1_scores = bert_score(sentences, [reference_text] * len(sentences), lang="en")
    best_sentence_index = np.argmax(f1_scores)
    best_sentence = sentences[best_sentence_index]
    best_f1_score = f1_scores[best_sentence_index].item()
    return best_sentence, best_f1_score
def _similarity_label(f1_score):
    """Return the similarity band name for a BERTScore F1 value."""
    # Same inclusive upper bounds that select_best_sentence applies, compared
    # against the score rounded to two decimals.
    rounded = round(f1_score, 2)
    for ceiling, label in (
        (0.20, "Very Low Similarity"),
        (0.40, "Low Similarity"),
        (0.60, "Moderate Similarity"),
        (0.80, "High Similarity"),
    ):
        if rounded <= ceiling:
            return label
    return "Very High Similarity"


def summarize_and_convert_to_audio(pdf_file):
    """Summarize a PDF's abstract and speak the best summary sentence.

    Every return value is ordered as ``(audio, sentence_text, score_text)``,
    matching the interface outputs (audio, summary sentence, score).

    Args:
        pdf_file: The uploaded PDF, passed through to ``extract_abstract``.

    Returns:
        ``(audio_bytes, best_sentence, score_text)`` on success. When no
        abstract is found, or no summary sentence could be produced, the
        audio slot is ``None``, the text slot carries an explanatory message,
        and the score text is ``"0.00 (Very Low Similarity)"``.
    """
    abstract_text = extract_abstract(pdf_file)
    if not abstract_text:
        # The message belongs in the text slot; the audio slot stays empty.
        return (
            None,
            "No 'abstract' section found in the uploaded PDF. Please upload a different PDF.",
            "0.00 (Very Low Similarity)",
        )

    best_sentence, best_f1_score = pr_recursive_summarize(abstract_text, abstract_text)
    if not best_sentence:
        # pr_recursive_summarize yields (None, 0.0) when the summary has no
        # sentences; there is nothing to hand to the speech synthesizer.
        return (
            None,
            "No summary sentence could be produced from the abstract.",
            "0.00 (Very Low Similarity)",
        )

    audio_bytes = convert_to_audio(best_sentence)
    # Derive the label from the score instead of always reporting the top band.
    score_text = f"{best_f1_score:.2f} ({_similarity_label(best_f1_score)})"
    return audio_bytes, best_sentence, score_text
| ##Building the Gradio UI. | |
# Output widgets, in the order the callback returns its values.
_output_components = [
    gr.Audio(label="Audio"),
    gr.Textbox(label="Summary sentence"),
    gr.Textbox(label="Bert F1-Score"),
]

# Sample PDFs offered as one-click examples in the UI.
_example_files = [
    "Hidden Technical Debt in Machine Learning Systems.pdf",
    "On the Cross-Entropy Method.pdf",
]

_app_description = "Upload a PDF file to extract and summarize its 'abstract' section. The best summary sentence based on its Bert F1-score will be converted into speech and the score's interpretation will be displayed. (A PDF file needs to contain the Abstract section.)"

iface = gr.Interface(
    fn=summarize_and_convert_to_audio,
    inputs=gr.File(label="Upload PDF"),
    outputs=_output_components,
    title="PDF Abstract Summarizer and Audio Converter",
    description=_app_description,
    examples=_example_files,
)

# Start serving the interface.
iface.launch()