# my space: https://huggingface.co/spaces/vividsd/practice

# I tried to use my previous code, but with some adaptations so it works on any PDF that contains an abstract

import gradio as gr

import PyPDF2
from PyPDF2 import PdfReader
import pdfplumber
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

import torch
import sentencepiece  # implicit dependency of the SpeechT5 processor
from datasets import load_dataset
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan


# My previous code relied on a few helper functions for the layout-based extraction
# (text, images via OCR, and tables). Minimal versions are included below so this
# file runs on its own; they follow the usual pdfminer/pdfplumber/pytesseract recipe.

def text_extraction(element):
    # Extract the text of the line
    line_text = element.get_text()
    # Collect the font name and size of every character in the line
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            for character in text_line:
                if isinstance(character, LTChar):
                    line_formats.append(character.fontname)
                    line_formats.append(character.size)
    # Keep the unique formats that appear in the line
    format_per_line = list(set(line_formats))
    return (line_text, format_per_line)


def crop_image(element, pageObj):
    # Crop the page to the bounding box of the image element
    # (left, bottom) and (right, top) in PDF coordinates
    pageObj.mediabox.lower_left = (element.x0, element.y0)
    pageObj.mediabox.upper_right = (element.x1, element.y1)
    # Save the cropped page as a one-page PDF
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)


def convert_to_images(input_file):
    # Render the one-page PDF as a PNG image (requires poppler)
    images = convert_from_path(input_file)
    images[0].save('PDF_image.png', 'PNG')


def image_to_text(image_path):
    # Run OCR over the image (requires tesseract)
    img = Image.open(image_path)
    return pytesseract.image_to_string(img)


def extract_table(pdf_path, page_num, table_num):
    # Extract the examined table with pdfplumber
    pdf = pdfplumber.open(pdf_path)
    table_page = pdf.pages[page_num]
    table = table_page.extract_tables()[table_num]
    pdf.close()
    return table


def table_converter(table):
    # Convert the table rows into a pipe-separated string
    table_string = ''
    for row in table:
        cleaned_row = ['None' if item is None else item.replace('\n', ' ') for item in row]
        table_string += ('|' + '|'.join(cleaned_row) + '|' + '\n')
    return table_string.rstrip('\n')


def read_pdf(pdf_path):
    # Create a PDF file object
    pdfFileObj = open(pdf_path, 'rb')
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdfFileObj)
    # Dictionary that will hold the content extracted from each page
    text_per_page = {}

    # We extract the pages from the PDF
    for pagenum, page in enumerate(extract_pages(pdf_path)):
        print("Processing Page_" + str(pagenum))

        # Initialize the variables needed for the text extraction from the page
        pageObj = pdf_reader.pages[pagenum]
        page_text = []
        line_format = []
        text_from_images = []
        text_from_tables = []
        page_content = []
        # Initialize the number of the examined tables
        table_num = 0
        first_element = True
        table_extraction_flag = False
        # Table bounding box, set once the first table on the page is found
        lower_side = upper_side = 0

        # Open the PDF with pdfplumber to find the tables on this page
        pdf = pdfplumber.open(pdf_path)
        page_tables = pdf.pages[pagenum]
        tables = page_tables.find_tables()

        # Find all the elements and sort them as they appear on the page (top to bottom)
        page_elements = [(element.y1, element) for element in page._objs]
        page_elements.sort(key=lambda a: a[0], reverse=True)

        # Walk through the elements that compose the page
        for i, component in enumerate(page_elements):
            # Extract the element of the page layout
            element = component[1]

            # Check if the element is a text element
            if isinstance(element, LTTextContainer):
                # Skip text that appeared inside a table
                if not table_extraction_flag:
                    # Use the helper to extract the text and format of each text element
                    (line_text, format_per_line) = text_extraction(element)
                    # Append the text of each line to the page text
                    page_text.append(line_text)
                    # Append the format of each line containing text
                    line_format.append(format_per_line)
                    page_content.append(line_text)

            # Check the elements for images
            if isinstance(element, LTFigure):
                # Crop the image from the PDF
                crop_image(element, pageObj)
                # Convert the cropped PDF to an image
                convert_to_images('cropped_image.pdf')
                # Extract the text from the image with OCR
                image_text = image_to_text('PDF_image.png')
                text_from_images.append(image_text)
                page_content.append(image_text)
                # Add a placeholder in the text and format lists
                page_text.append('image')
                line_format.append('image')

            # Check the elements for tables
            if isinstance(element, LTRect):
                # If this is the first rectangular element of the next table
                if first_element and (table_num + 1) <= len(tables):
                    # Find the bounding box of the table
                    lower_side = page.bbox[3] - tables[table_num].bbox[3]
                    upper_side = element.y1
                    # Extract the information from the table
                    table = extract_table(pdf_path, pagenum, table_num)
                    # Convert the table information into a structured string format
                    table_string = table_converter(table)
                    # Append the table string to the list
                    text_from_tables.append(table_string)
                    page_content.append(table_string)
                    # Set the flag to True so the table content is not extracted again
                    table_extraction_flag = True
                    # Make it another element
                    first_element = False
                    # Add a placeholder in the text and format lists
                    page_text.append('table')
                    line_format.append('table')

                # Check if we already extracted the tables from the page
                if element.y0 >= lower_side and element.y1 <= upper_side:
                    pass
                elif i + 1 >= len(page_elements) or not isinstance(page_elements[i + 1][1], LTRect):
                    table_extraction_flag = False
                    first_element = True
                    table_num += 1

        # Create the key of the dictionary and store everything extracted from the page
        dctkey = 'Page_' + str(pagenum)
        # Add the list of lists as the value of the page key
        text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]

    # Close the PDF file object
    pdfFileObj.close()
    return text_per_page
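# Quick sanity check for the extraction step on its own (assumption: a local file
# named "sample.pdf" exists; the name is only for illustration). Each page maps to
# [page_text, line_format, text_from_images, text_from_tables, page_content]:
#
#   pages = read_pdf("sample.pdf")
#   print(pages["Page_0"][4])   # the combined content of the first page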
def pdf_to_audio(pdf_file):
    # The whole pipeline lives in one function so Gradio can run it on the uploaded file
    pdf_path = pdf_file.name
    text_per_page = read_pdf(pdf_path)

    # Flatten everything extracted from the first page and clean the line breaks
    page_0 = text_per_page['Page_0']
    page_0_clean = [item for sublist in page_0 for item in sublist if isinstance(item, str)]
    for i in range(len(page_0_clean)):
        page_0_clean[i] = page_0_clean[i].replace('\n', ' ').strip()

    # Instead of cleaning an exact position as I did in my previous code (which I don't
    # know here), I try to identify the abstract section by its headings: collect every
    # line between the "Abstract" heading and the "1 Introduction" heading
    abstract = 'abstract'
    intro_string = 'introduction'
    found_abstract = False
    extracted_abstract = ""
    for line in page_0_clean:
        lower_line = line.lower().strip()
        if lower_line == abstract:
            found_abstract = True
            continue
        if "1" in lower_line and intro_string in lower_line:
            found_abstract = False
        if found_abstract:
            extracted_abstract += line + " "
    extracted_abstract = extracted_abstract.replace("Abstract", "").strip()

    # Summarizing the abstract
    summarizer = pipeline("summarization", model="Falconsai/text_summarization")
    summarized_text = summarizer(extracted_abstract, max_length=20, min_length=10, do_sample=False)
    # Unlike my previous code, where I copied and pasted the summary, here I keep it in a variable
    sentence = summarized_text[0]['summary_text']

    # Generating the audio of the output by using my previous code
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    inputs = processor(text=sentence, return_tensors="pt")
    # Speaker embedding for the synthetic voice
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # gr.Audio accepts a (sample_rate, numpy array) tuple
    return (16000, speech.numpy())


# Creating the Gradio app
input_component = gr.File(file_types=[".pdf"])
output_component = gr.Audio()

demo = gr.Interface(
    fn=pdf_to_audio,
    inputs=input_component,
    outputs=output_component,
    title="Reading your abstract summary out loud",
    description="Upload a PDF that contains an Abstract. The abstract is summarized in one sentence and read out loud. Only PDFs with an Abstract section are supported."
)

demo.launch()
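# Note on running this as a Space: the imports above imply a requirements.txt along
# these lines (a sketch, not necessarily the exact file the Space uses):
#
#   gradio
#   transformers
#   torch
#   datasets
#   sentencepiece
#   PyPDF2
#   pdfminer.six
#   pdfplumber
#   pdf2image
#   pytesseract
#   Pillow
#
# pdf2image also needs the poppler system package and pytesseract needs tesseract-ocr,
# e.g. a packages.txt on Spaces containing "poppler-utils" and "tesseract-ocr".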