# Here are the imports
import PyPDF2
import re
import torch
from transformers import pipeline
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
import gradio as gr
import io
import numpy as np
import soundfile as sf
import tempfile

# Here is the code

# Function to extract and clean the abstract from a PDF
def extract_and_clean_abstract(uploaded_file):
    if uploaded_file is None:
        return "No file uploaded."

    # Read the file using its temporary file path
    with open(uploaded_file.name, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        full_text = ""
        for page in reader.pages:
            full_text += page.extract_text()

    # Find the abstract: capture the text between the "Abstract" heading
    # and the next section marker (Introduction, Keywords, or a section number)
    pattern = r"(Abstract|ABSTRACT|abstract)(.*?)(Introduction|INTRODUCTION|introduction|1|Keywords|KEYWORDS|keywords)"
    match = re.search(pattern, full_text, re.DOTALL)
    if match:
        abstract = match.group(2).strip()
    else:
        return "Abstract not found."

    # Clean the abstract: remove line breaks and hyphenation artifacts
    cleaned_abstract = abstract.replace('\n', ' ').replace('- ', '')
    return cleaned_abstract

# Function to summarize text
def summarize_text(text):
    # Initialize the summarization pipeline with the summarization model
    summarizer = pipeline(
        "summarization",
        "pszemraj/led-base-book-summary",
        device=0 if torch.cuda.is_available() else -1,
    )

    # Generate the summary
    result = summarizer(
        text,
        min_length=8,
        max_length=25,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        do_sample=False,
        early_stopping=True,
    )

    # Extract the first sentence from the summary
    first_sentence = re.split(r'(?<=[.:;!?])\s', result[0]['summary_text'])[0]
    return first_sentence

# Function for text-to-speech
def text_to_speech(text):
    # Check if CUDA is available and set the device accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the TTS model and task from the Hugging Face Hub
    models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
        "facebook/fastspeech2-en-ljspeech",  # Or another TTS model of your choice
        arg_overrides={"vocoder": "hifigan", "fp16": False}
    )

    # Ensure the model is on the correct device
    model = models[0].to(device)

    # Update the config with the data config from the task
    TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)

    # Build the generator
    generator = task.build_generator([model], cfg)

    # Get the model input from the text and move the tensors to the device
    sample = TTSHubInterface.get_model_input(task, text)
    sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"].to(device)
    sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"].to(device)

    # Generate the waveform
    wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)

    # Move the waveform to CPU if it's on GPU
    if wav.is_cuda:
        wav = wav.cpu()

    # Write the waveform to a temporary file and return the file path
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        sf.write(tmp_file.name, wav.numpy(), rate)
        return tmp_file.name

def process_pdf(uploaded_file):
    """
    Process the uploaded PDF file: extract the abstract, summarize it,
    and convert the summary to speech.
    """
    abstract = extract_and_clean_abstract(uploaded_file)
    summary = summarize_text(abstract)
    audio_output = text_to_speech(summary)
    return audio_output

# Create the Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Audio(label="Audio Summary"),
    title="PDF Abstract Summary to Speech",
    description="Upload a PDF file that contains an abstract. The app will extract the abstract, summarize it, and convert the summary to speech.",
    examples=[
        ["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"],
        ["Article 7 Efficient Estimation of Word Representations in Vector Space.pdf"],
        ["Article 6 BloombergGPT_ A Large Language Model for Finance.pdf"],
    ]
)

# Run the Gradio app
iface.launch()
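
# Note on dependencies: this script assumes PyPDF2, torch, transformers,
# fairseq, gradio, and soundfile are installed. The fairseq FastSpeech 2
# checkpoint also relies on a grapheme-to-phoneme frontend (the g2p_en
# package) for English input, so that dependency is assumed to be available
# as well. The example PDFs listed in the interface must exist locally for
# the Gradio examples to load.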