import torch
from transformers import BitsAndBytesConfig, pipeline
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
from PIL import Image
import nltk
from nltk import sent_tokenize
import re
import numpy as np
import datetime
import subprocess
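
# Dependency note (inferred from the imports above, not stated in the original):
# this script assumes torch, transformers, accelerate, bitsandbytes,
# openai-whisper, gradio, gTTS, Pillow, nltk, and numpy are installed, plus an
# ffmpeg binary on PATH (used by whisper and by the placeholder step below):
#   pip install torch transformers accelerate bitsandbytes openai-whisper gradio gTTS Pillow nltk numpy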

warnings.filterwarnings("ignore", category=FutureWarning)

# Download the sentence-tokenizer data used by nltk's sent_tokenize.
nltk.download('punkt')

model_id = "llava-hf/llava-1.5-7b-hf"

# BitsAndBytesConfig is imported but was never used; wiring in 4-bit
# quantization here is an assumption based on that import.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs={"quantization_config": quantization_config},
)
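
# Optional smoke test (hypothetical image path; uncomment to exercise the
# pipeline once before wiring up the UI):
# print(pipe(Image.open("example.jpg"),
#            prompt="USER: <image>\nDescribe the image.\nASSISTANT:",
#            generate_kwargs={"max_new_tokens": 50}))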

# Whisper speech-to-text model, on GPU when available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("medium", device=DEVICE)


def writehistory(text):
    """Append a line to a timestamped log file.

    Note: the seconds-resolution timestamp means each call typically creates
    a new file rather than appending to one shared log.
    """
    tstamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = f'{tstamp}_log.txt'
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text + '\n')


def img2txt(input_text, input_image):
    """Describe an image with LLaVA, optionally guided by a user question."""
    image = Image.open(input_image)

    # Fall back to a generic description prompt when there is no usable
    # transcript (empty string, or the tuple Gradio can pass through).
    if isinstance(input_text, tuple) or not input_text:
        prompt_instructions = ("Describe the image using as much detail as possible: "
                               "is it a painting or a photograph, what colors are "
                               "predominant, what is the image about?")
    else:
        prompt_instructions = ("Act as an expert in imagery descriptive analysis. Using as "
                               "much detail as possible from the image, respond to the "
                               f"following prompt: {input_text}")

    prompt = f"USER: <image>\n{prompt_instructions}\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # The pipeline echoes the prompt, so keep only the text after "ASSISTANT:".
    if outputs and len(outputs[0]["generated_text"]) > 0:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"], re.DOTALL)
        reply = match.group(1) if match else "No response found."
    else:
        reply = "No response generated."

    return reply


def transcribe(audio):
    # Nothing was recorded: return an empty transcript (a single string, to
    # match how the result is consumed in process_inputs below).
    if not audio:
        return ''

    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)  # Whisper decodes fixed 30-second windows
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # fp16 decoding is only available on GPU; fall back to fp32 on CPU.
    options = whisper.DecodingOptions(fp16=torch.cuda.is_available())
    result = whisper.decode(model, mel, options)
    return result.text


def text_to_speech(text, file_path):
    # gTTS synthesizes speech through Google's TTS web service, so this step
    # needs network access.
    audioobj = gTTS(text=text, lang='en', slow=False)
    audioobj.save(file_path)
    return file_path
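
# Example (hypothetical text and path):
# text_to_speech("Hello there!", "hello.mp3") writes hello.mp3 and returns its
# path, which Gradio's Audio component can play back directly.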

# Create a 10-second silent MP3 as a placeholder for the audio output widget;
# requires ffmpeg on PATH. The -y flag overwrites any existing Temp.mp3 so
# reruns do not hang on ffmpeg's overwrite prompt.
command = ['ffmpeg', '-y', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=mono',
           '-t', '10', '-q:a', '9', '-acodec', 'libmp3lame', 'Temp.mp3']
subprocess.run(command, check=True)


def process_inputs(audio_path, image_path):
    # Voice -> text -> LLaVA -> speech, returned in the order the UI expects.
    speech_to_text_output = transcribe(audio_path)
    llava_output = img2txt(speech_to_text_output, image_path) if image_path else "No image provided."
    processed_audio_path = text_to_speech(llava_output, "Temp3.mp3")
    return speech_to_text_output, llava_output, processed_audio_path


iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="LLaVA Output"),
        gr.Audio("Temp.mp3"),  # silent placeholder; replaced by Temp3.mp3 after each run
    ],
    title="Multilanguage Voice Assistant App",
    description="Upload an image and interact via voice input and audio response.",
)

# share=True also exposes the app through a temporary public gradio.live URL.
iface.launch(debug=True, share=True)