|
|
|
import functools
import io
import re
import tempfile

import gradio as gr
import numpy as np
import PyPDF2
import soundfile as sf
import torch
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
from transformers import pipeline
|
|
|
|
|
|
|
|
|
def extract_and_clean_abstract(uploaded_file):
    """Pull the abstract out of an uploaded PDF and flatten it to one line.

    The text of every page is concatenated, then the span between an
    "Abstract" heading and the first following section marker
    ("Introduction", "Keywords", or a stand-alone section number ``1``)
    is taken as the abstract.

    Args:
        uploaded_file: The upload handed over by the UI. Either an object
            exposing the file path as ``.name`` or a plain path string.
            ``None`` means nothing was uploaded.

    Returns:
        The cleaned abstract, or one of the literal messages
        ``"No file uploaded."`` / ``"Abstract not found."``.
    """
    if uploaded_file is None:
        return "No file uploaded."

    # Accept both a file-like wrapper (path stored in ``.name``) and a bare
    # path string, so the function works whichever form the caller passes.
    pdf_path = getattr(uploaded_file, "name", uploaded_file)

    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # ``or ""`` guards against a page that yields no text at all, which
        # would otherwise break the concatenation.
        full_text = "".join(page.extract_text() or "" for page in reader.pages)

    # Group 2 is the abstract body. The section number "1" only terminates
    # the abstract when it is glued to the "Introduction" heading or stands
    # alone at the start of a line; a bare ``1`` would cut the abstract at
    # the first digit one anywhere in the text (e.g. "2019" or "91%").
    pattern = (
        r"(Abstract|ABSTRACT|abstract)"
        r"(.*?)"
        r"((?:\b1\.?\s*)?(?:Introduction|INTRODUCTION|introduction)"
        r"|^[ \t]*1\.?(?=\s)"
        r"|Keywords|KEYWORDS|keywords)"
    )
    match = re.search(pattern, full_text, re.DOTALL | re.MULTILINE)
    if match is None:
        return "Abstract not found."

    abstract = match.group(2).strip()

    # Join wrapped lines, then drop the "- " left behind by words that were
    # hyphenated across a line break.
    return abstract.replace("\n", " ").replace("- ", "")
|
|
|
|
|
@functools.lru_cache(maxsize=1)
def _load_summarizer():
    """Build the summarization pipeline once and reuse it on later calls."""
    return pipeline(
        "summarization",
        "pszemraj/led-base-book-summary",
        device=0 if torch.cuda.is_available() else -1,
    )


def summarize_text(text):
    """Summarize *text* and return only the first sentence of the summary.

    Args:
        text: The text to summarize.

    Returns:
        The first sentence of the generated summary (split after
        ``. : ; ! ?``), or an empty string when *text* is empty or
        whitespace-only.
    """
    # Nothing to summarize: skip loading and running the model entirely.
    if not text or not text.strip():
        return ""

    # The pipeline is cached, so the model is loaded on the first call only
    # instead of on every request.
    summarizer = _load_summarizer()
    result = summarizer(
        text,
        min_length=8,
        max_length=25,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        do_sample=False,
        early_stopping=True,
    )

    summary = result[0]["summary_text"]
    # Keep everything up to the first sentence-ending punctuation mark.
    return re.split(r"(?<=[.:;!?])\s", summary)[0]
|
|
|
|
|
@functools.lru_cache(maxsize=1)
def _load_tts_components():
    """Load the TTS task, model, generator and device once per process."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
        "facebook/fastspeech2-en-ljspeech",
        arg_overrides={"vocoder": "hifigan", "fp16": False},
    )
    model = models[0].to(device)

    TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
    generator = task.build_generator([model], cfg)
    return task, model, generator, device


def text_to_speech(text):
    """Synthesize *text* to speech and write it to a temporary WAV file.

    Args:
        text: The text to speak.

    Returns:
        Path of the generated ``.wav`` file. The file is created with
        ``delete=False`` and is not removed by this function.
    """
    # Cached: the checkpoint is loaded on the first call only instead of on
    # every request.
    task, model, generator, device = _load_tts_components()

    # Move the model inputs to the same device as the model.
    sample = TTSHubInterface.get_model_input(task, text)
    net_input = sample["net_input"]
    net_input["src_tokens"] = net_input["src_tokens"].to(device)
    net_input["src_lengths"] = net_input["src_lengths"].to(device)

    wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)

    # Reserve a file name, then close the handle before writing by name:
    # reopening a still-open NamedTemporaryFile is platform-dependent and
    # fails on Windows.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        wav_path = tmp_file.name

    # detach().cpu() makes the conversion valid for any tensor, whether it
    # lives on the GPU or still tracks gradients.
    sf.write(wav_path, wav.detach().cpu().numpy(), rate)
    return wav_path
|
|
|
def process_pdf(uploaded_file):
    """Extract a PDF's abstract, summarize it, and convert it to speech.

    Args:
        uploaded_file: The upload handed over by the Gradio file input.

    Returns:
        Path of the audio file containing the spoken summary.

    Raises:
        gr.Error: If no file was uploaded, no abstract could be found, or
            the summary came back empty. Gradio shows the message in the UI.
    """
    abstract = extract_and_clean_abstract(uploaded_file)

    # extract_and_clean_abstract reports failure by returning these literal
    # messages. Surface them as UI errors instead of summarizing and reading
    # them aloud as if they were the abstract.
    if not abstract or abstract in ("No file uploaded.", "Abstract not found."):
        raise gr.Error(abstract or "Abstract not found.")

    summary = summarize_text(abstract)
    if not summary:
        raise gr.Error("Could not summarize the abstract.")

    return text_to_speech(summary)
|
|
|
|
|
# Input and output widgets of the demo: a PDF upload and an audio player.
pdf_input = gr.File(label="Upload PDF")
audio_output = gr.Audio(label="Audio Summary")

# Sample papers offered as one-click examples; each row holds one value
# because the interface has a single input component.
example_rows = [
    ["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"],
    ["Article 7 Efficient Estimation of Word Representations in Vector Space.pdf"],
    ["Article 6 BloombergGPT_ A Large Language Model for Finance.pdf"],
]

# Wire process_pdf into a Gradio interface.
iface = gr.Interface(
    fn=process_pdf,
    inputs=pdf_input,
    outputs=audio_output,
    title="PDF Abstract Summary to Speech",
    description="Upload only a PDF file that has an abstract. The model will extract its abstract, summarize it, and converts the summary to speech.",
    examples=example_rows,
)

# Start the web app as soon as the module is executed.
iface.launch()
|
|