# Talk-To-PDF / app.py
# (Hugging Face file-viewer residue below, converted to comments so the
#  module parses as Python:)
# nirajandhakal's picture
# Update app.py
# 4e8fa7f verified
# raw history blame — No virus — 2.04 kB
import os
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import PyPDF2
import sounddevice as sd
import numpy as np
from gtts import gTTS
from io import BytesIO
import gradio as gr
def load_quantized_model(model_name):
    """Load a seq2seq model and tokenizer, then dynamically quantize the model.

    Dynamic int8 quantization of the ``torch.nn.Linear`` layers shrinks the
    model and speeds up CPU inference. The model is returned in eval mode.

    Args:
        model_name: Hugging Face hub identifier of the model to load.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Replace Linear layers with int8 dynamically-quantized equivalents.
    quantized = torch.quantization.quantize_dynamic(
        base_model,
        {torch.nn.Linear},
        dtype=torch.qint8,
    )
    quantized.eval()
    return quantized, tokenizer
def pdf_to_text(pdf_bytes):
    """Extract plain text from a PDF supplied as raw bytes.

    Args:
        pdf_bytes: the complete PDF file content as ``bytes``.

    Returns:
        The concatenated text of every page; pages with no extractable
        text (e.g. scanned images) contribute an empty string.
    """
    pdf_file_obj = BytesIO(pdf_bytes)
    # PdfReader / .pages / extract_text() replace the deprecated
    # PdfFileReader / numPages / getPage / extractText API that was
    # removed in PyPDF2 3.x (the old names raise DeprecationError there).
    reader = PyPDF2.PdfReader(pdf_file_obj)
    text = ''
    for page in reader.pages:
        # extract_text() can return None for image-only pages.
        text += page.extract_text() or ''
    pdf_file_obj.close()
    return text
def generate_audio(model, tokenizer, text):
    """Run the seq2seq model on *text* and return the decoded output string.

    Fixes over the original:
      * the input is moved to the model's own device instead of an
        unconditional ``.cuda()``, which crashed on CPU-only hosts (and a
        dynamically quantized model only runs on CPU anyway);
      * the redundant ``torch.tensor()`` wrap around the tensor already
        produced by ``return_tensors="pt"`` is removed;
      * special tokens (pad/eos) are stripped from the decoded text.

    NOTE(review): "microsoft/speecht5_tts" is a text-to-speech model, not a
    text seq2seq LM — confirm this generate/decode pipeline actually yields
    meaningful text for it.
    """
    # tokenizer.encode(..., return_tensors="pt") already returns a batched
    # LongTensor; no extra wrapping needed.
    input_ids = tokenizer.encode(text, return_tensors="pt")
    try:
        device = next(model.parameters()).device
    except StopIteration:
        # A model with no (float) parameters left — fall back to CPU.
        device = torch.device("cpu")
    input_ids = input_ids.to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=500, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
def save_and_play_audio(text):
    """Synthesize *text* to English speech with gTTS and save it to output.mp3.

    Args:
        text: the text to speak.

    Returns:
        The path of the saved MP3 file.

    The original implementation then called ``sd.default.read_audio`` — an
    attribute that does not exist in sounddevice (reading audio files is
    soundfile's API), so the function always raised AttributeError after
    saving. Server-side playback is also meaningless on a headless
    deployment, so the playback step is removed entirely; the Gradio
    frontend is the right place to play the returned file.
    """
    tts = gTTS(text=text, lang='en')
    output_file = "output.mp3"
    tts.save(output_file)
    return output_file
def main(pdf_file):
    """Gradio handler: uploaded PDF -> extracted text -> spoken MP3.

    Args:
        pdf_file: the upload, accepted as raw ``bytes``, a filesystem path
            (``str``), or a file-like object with ``.read()`` — covering the
            return types of the various ``gr.File`` ``type`` settings across
            Gradio versions.

    Returns:
        A dict with key ``"output_file"`` holding the path of the MP3.
    """
    # Normalize the upload to raw PDF bytes regardless of how Gradio
    # delivered it.
    if isinstance(pdf_file, (bytes, bytearray)):
        pdf_bytes = bytes(pdf_file)
    elif isinstance(pdf_file, str):
        with open(pdf_file, "rb") as fh:
            pdf_bytes = fh.read()
    else:
        pdf_bytes = pdf_file.read()
    # Load the quantized model.
    # NOTE(review): reloading the model on every request is slow — consider
    # loading it once at module import time instead.
    model, tokenizer = load_quantized_model("microsoft/speecht5_tts")
    # The original moved the model to GPU here, but torch's dynamic
    # quantization produces CPU-only int8 ops — model.cuda() on it is
    # unsupported, so the model stays on CPU.
    text = pdf_to_text(pdf_bytes)
    # Generate the output text from the extracted PDF text.
    audio_text = generate_audio(model, tokenizer, text)
    # Synthesize and save the audio file.
    output_file = save_and_play_audio(audio_text)
    return {"output_file": output_file}
if __name__ == "__main__":
    # The gr.inputs namespace was removed in Gradio 3.x, and "pdf" was never
    # a valid File type; type="binary" hands the handler the raw file bytes.
    app = gr.Interface(main, inputs=gr.File(type="binary"), outputs="text")
    app.launch()