import torch
from transformers import BitsAndBytesConfig, pipeline
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
from PIL import Image
import nltk
from nltk import sent_tokenize
import re
import numpy as np
import datetime
import subprocess

# Suppress warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Ensure the nltk punkt tokenizer is available
nltk.download('punkt')

# Model ID
model_id = "llava-hf/llava-1.5-7b-hf"

# Load the LLaVA pipeline without GPU-dependent quantization
pipe = pipeline("image-to-text", model=model_id)

# Initialize the Whisper speech-to-text model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("medium", device=DEVICE)

# Function to log conversation history to a timestamped file
def writehistory(text):
    tstamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = f'{tstamp}_log.txt'
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text + '\n')

# Function to process an image and generate a description
def img2txt(input_text, input_image):
    image = Image.open(input_image)

    if isinstance(input_text, tuple):
        prompt_instructions = (
            "Describe the image using as much detail as possible: is it a painting or a "
            "photograph, what colors are predominant, and what is the image about?"
        )
    else:
        prompt_instructions = (
            "Act as an expert in imagery descriptive analysis. Using as much detail as possible "
            f"from the image, respond to the following prompt: {input_text}"
        )

    # LLaVA expects the <image> placeholder token in the prompt
    prompt = f"USER: <image>\n{prompt_instructions}\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    if outputs and len(outputs[0]["generated_text"]) > 0:
        # Extract everything after "ASSISTANT:" (DOTALL so multi-line replies are kept)
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"], re.DOTALL)
        reply = match.group(1) if match else "No response found."
    else:
        reply = "No response generated."

    return reply

# Function to transcribe audio with Whisper
def transcribe(audio):
    if not audio:
        return ''

    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    options = whisper.DecodingOptions(fp16=(DEVICE == "cuda"))
    result = whisper.decode(model, mel, options)
    return result.text

# Function to convert text to speech with gTTS
def text_to_speech(text, file_path):
    audioobj = gTTS(text=text, lang='en', slow=False)
    audioobj.save(file_path)
    return file_path

# Generate a silent audio file used as the initial audio output
command = ['ffmpeg', '-y', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=mono',
           '-t', '10', '-q:a', '9', '-acodec', 'libmp3lame', 'Temp.mp3']
subprocess.run(command, check=True)

# Function to handle both inputs and generate all outputs
def process_inputs(audio_path, image_path):
    speech_to_text_output = transcribe(audio_path)
    chatgpt_output = img2txt(speech_to_text_output, image_path) if image_path else "No image provided."
    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")
    return speech_to_text_output, chatgpt_output, processed_audio_path

# Create the Gradio interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio("Temp.mp3", label="Assistant Audio"),
    ],
    title="Multilanguage Voice Assistant App",
    description="Upload an image and interact via voice input and audio response."
)

# Launch the interface
iface.launch(debug=True, share=True)
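
# Optional: the BitsAndBytesConfig import above is only useful if you load the LLaVA
# pipeline quantized on a CUDA GPU (with bitsandbytes installed). This is a minimal
# sketch of that alternative, not part of the flow above; it is left commented out so
# the script still runs on CPU-only machines.
#
#     quant_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_compute_dtype=torch.float16,
#     )
#     pipe = pipeline(
#         "image-to-text",
#         model=model_id,
#         model_kwargs={"quantization_config": quant_config},
#     )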