"""Read an uploaded PDF aloud: extract its text with PyPDF2, pass the text through a
dynamically quantized Hugging Face seq2seq model, synthesize speech with gTTS, and
expose the pipeline through a Gradio interface.

Dependencies: torch, transformers, PyPDF2, gTTS, sounddevice, soundfile, gradio.
"""

from io import BytesIO

import gradio as gr
import sounddevice as sd
import soundfile as sf
import torch
from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


def load_quantized_model(model_name):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Dynamic quantization converts the Linear layers to int8. The quantized model
    # only runs on CPU, so it is never moved to the GPU below.
    model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
    model.eval()
    return model, tokenizer


def pdf_to_text(pdf_bytes):
    # PdfReader / pages / extract_text() replace the removed
    # PdfFileReader / getPage() / extractText() API of older PyPDF2 releases.
    pdf_file_obj = BytesIO(pdf_bytes)
    pdf_reader = PdfReader(pdf_file_obj)
    text = ''
    for page in pdf_reader.pages:
        text += page.extract_text() or ''
    pdf_file_obj.close()
    return text


def generate_audio(model, tokenizer, text):
    # Run the extracted text through the seq2seq model and decode the result back to
    # text before handing it to gTTS. Note: "microsoft/speecht5_tts" is a SpeechT5
    # speech-synthesis checkpoint; AutoModelForSeq2SeqLM may refuse to load it, and
    # this text-to-text pass really expects a text seq2seq model (e.g. a summarizer).
    input_ids = tokenizer.encode(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=500, pad_token_id=tokenizer.eos_token_id)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return output_text


def save_and_play_audio(text):
    tts = gTTS(text=text, lang='en')
    output_file = "output.mp3"
    tts.save(output_file)
    # soundfile reads the MP3 (requires libsndfile >= 1.1.0); sounddevice plays it.
    data, fs = sf.read(output_file, dtype="float32")
    sd.play(data, fs)
    sd.wait()
    return output_file


def main(pdf_bytes):
    # Load the quantized model (kept on CPU: dynamically quantized layers cannot run on CUDA)
    model, tokenizer = load_quantized_model("microsoft/speecht5_tts")

    # Convert the uploaded PDF bytes to text
    text = pdf_to_text(pdf_bytes)

    # Generate the text that will be spoken
    audio_text = generate_audio(model, tokenizer, text)

    # Save and play the audio
    output_file = save_and_play_audio(audio_text)

    return output_file


if __name__ == "__main__":
    # gr.File with type="binary" passes the upload to main() as raw bytes (Gradio 3+);
    # the old gr.inputs namespace and type="pdf" no longer exist.
    app = gr.Interface(
        fn=main,
        inputs=gr.File(file_types=[".pdf"], type="binary"),
        outputs="text",
    )
    app.launch()