import torch
from transformers import BitsAndBytesConfig, pipeline
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
from PIL import Image
import nltk
from nltk import sent_tokenize
import re
import numpy as np
import datetime
import subprocess
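
# Dependency note (inferred from the imports above, not stated in the original):
# this script assumes torch, transformers, accelerate, bitsandbytes,
# openai-whisper, gradio, gTTS, Pillow, nltk, and numpy are installed, plus an
# ffmpeg binary on PATH (used by whisper and by the placeholder step below):
#   pip install torch transformers accelerate bitsandbytes openai-whisper gradio gTTS Pillow nltk numpy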

warnings.filterwarnings("ignore", category=FutureWarning)

# Download the sentence-tokenizer data used by nltk's sent_tokenize.
nltk.download('punkt')

model_id = "llava-hf/llava-1.5-7b-hf"

# BitsAndBytesConfig is imported but was never used; wiring in 4-bit
# quantization here is an assumption based on that import.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs={"quantization_config": quantization_config},
)
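
# Optional smoke test (hypothetical image path; uncomment to exercise the
# pipeline once before wiring up the UI):
# print(pipe(Image.open("example.jpg"),
#            prompt="USER: <image>\nDescribe the image.\nASSISTANT:",
#            generate_kwargs={"max_new_tokens": 50}))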

# Whisper speech-to-text model, on GPU when available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("medium", device=DEVICE)


def writehistory(text):
    """Append a line to a timestamped log file.

    Note: the seconds-resolution timestamp means each call typically creates
    a new file rather than appending to one shared log.
    """
    tstamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = f'{tstamp}_log.txt'
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text + '\n')


def img2txt(input_text, input_image):
    """Describe an image with LLaVA, optionally guided by a user question."""
    image = Image.open(input_image)

    # Fall back to a generic description prompt when there is no usable
    # transcript (empty string, or the tuple Gradio can pass through).
    if isinstance(input_text, tuple) or not input_text:
        prompt_instructions = ("Describe the image using as much detail as possible: "
                               "is it a painting or a photograph, what colors are "
                               "predominant, what is the image about?")
    else:
        prompt_instructions = ("Act as an expert in imagery descriptive analysis. Using as "
                               "much detail as possible from the image, respond to the "
                               f"following prompt: {input_text}")

    prompt = f"USER: <image>\n{prompt_instructions}\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # The pipeline echoes the prompt, so keep only the text after "ASSISTANT:".
    if outputs and len(outputs[0]["generated_text"]) > 0:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"], re.DOTALL)
        reply = match.group(1) if match else "No response found."
    else:
        reply = "No response generated."

    return reply


def transcribe(audio):
    # Nothing was recorded: return an empty transcript (a single string, to
    # match how the result is consumed in process_inputs below).
    if not audio:
        return ''

    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)  # Whisper decodes fixed 30-second windows
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # fp16 decoding is only available on GPU; fall back to fp32 on CPU.
    options = whisper.DecodingOptions(fp16=torch.cuda.is_available())
    result = whisper.decode(model, mel, options)
    return result.text


def text_to_speech(text, file_path):
    # gTTS synthesizes speech through Google's TTS web service, so this step
    # needs network access.
    audioobj = gTTS(text=text, lang='en', slow=False)
    audioobj.save(file_path)
    return file_path
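
# Example (hypothetical text and path):
# text_to_speech("Hello there!", "hello.mp3") writes hello.mp3 and returns its
# path, which Gradio's Audio component can play back directly.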

# Create a 10-second silent MP3 as a placeholder for the audio output widget;
# requires ffmpeg on PATH. The -y flag overwrites any existing Temp.mp3 so
# reruns do not hang on ffmpeg's overwrite prompt.
command = ['ffmpeg', '-y', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=mono',
           '-t', '10', '-q:a', '9', '-acodec', 'libmp3lame', 'Temp.mp3']
subprocess.run(command, check=True)


def process_inputs(audio_path, image_path):
    # Voice -> text -> LLaVA -> speech, returned in the order the UI expects.
    speech_to_text_output = transcribe(audio_path)
    llava_output = img2txt(speech_to_text_output, image_path) if image_path else "No image provided."
    processed_audio_path = text_to_speech(llava_output, "Temp3.mp3")
    return speech_to_text_output, llava_output, processed_audio_path


iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="LLaVA Output"),
        gr.Audio("Temp.mp3"),  # silent placeholder; replaced by Temp3.mp3 after each run
    ],
    title="Multilanguage Voice Assistant App",
    description="Upload an image and interact via voice input and audio response.",
)

# share=True also exposes the app through a temporary public gradio.live URL.
iface.launch(debug=True, share=True)