import torch
from transformers import BitsAndBytesConfig, pipeline
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
import nltk
nltk.download('punkt')
from nltk import sent_tokenize
import numpy as np
import re
import datetime
import requests
import base64
import io
from PIL import Image

# Load the LLaVA vision-language model in 4-bit precision to reduce GPU memory usage
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model_id = "llava-hf/llava-1.5-7b-hf"

pipe = pipeline("image-to-text",
                model=model_id,
                model_kwargs={"quantization_config": quantization_config})

# Select the GPU if available, otherwise fall back to the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")

# Load the Whisper speech-recognition model
model = whisper.load_model("medium", device=DEVICE)

# Create a file to log events
tstamp = datetime.datetime.now()
tstamp = str(tstamp).replace(' ', '_')
logfile = f'{tstamp}_log.txt'

def writehistory(text):
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')

def img2txt(input_text, input_image):
    # Load the image
    image = Image.open(input_image)

    writehistory(f"Input text: {input_text} - Type: {type(input_text)} - Dir: {dir(input_text)}")

    # Use a default prompt if the user does not provide one
    if isinstance(input_text, tuple):
        prompt_instructions = """
        Describe the medical condition shown in the image using as much detail as possible and provide a treatment plan for the medical condition
        """
    # Otherwise pass an instruction prompt to the LLM and append the user's text
    else:
        prompt_instructions = """
        Act as an expert in medical imagery descriptive analysis. Utilize the information depicted in the provided image to generate a comprehensive description of the observed medical condition. Include detailed observations regarding any anomalies, abnormalities, or notable features present in the image. Your response should be thorough and precise, and provide a treatment plan for the medical condition.
        """ + input_text

    writehistory(f"prompt_instructions: {prompt_instructions}")
    # LLaVA chat template: the <image> placeholder marks where the image is injected
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # Extract the response text
    if outputs is not None and len(outputs[0]["generated_text"]) > 0:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
        if match:
            # Keep only the text after "ASSISTANT:"
            reply = match.group(1)
        else:
            reply = "No response found."
    else:
        reply = "No response generated."
    return reply

def transcribe(audio):
    # Check whether the audio input is None or empty
    if audio is None or audio == '':
        return ('', '', None)  # Return empty strings and no audio file

    # Load and transcribe the recording with Whisper
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language (the probabilities are not used further)
    _, probs = model.detect_language(mel)

    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text

    return result_text

# Convert text to speech and save it as an MP3 file
def text_to_speech(text, file_path):
    language = 'en'

    audioobj = gTTS(text=text, lang=language, slow=False)
    audioobj.save(file_path)

    return file_path

# Handle the audio and image inputs from the Gradio interface
def process_inputs(audio_path, image_path):
    # Transcribe the voice recording
    speech_to_text_output = transcribe(audio_path)

    # Describe the image, using the transcript as the user prompt
    if image_path:
        chatgpt_output = img2txt(speech_to_text_output, image_path)
    else:
        chatgpt_output = "No image provided."

    # Convert the model's reply to speech and save it to a temporary file
    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")

    return speech_to_text_output, chatgpt_output, processed_audio_path

# Create the interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="Model Output"),
        gr.Audio(label="Spoken Response")
    ],
    title="(Beta) Medical Research Model with Voice-to-Text Feature",
    description="Upload an image and interact via voice input and audio. (You must grant microphone permission.)"
)

# Launch the interface
iface.launch(inline=False, share=True)
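
# Optional local sanity check without the web UI (a hypothetical example:
# "sample.wav" and "xray.png" are placeholder file paths, not part of this project).
# Uncomment to run:
#
# text, reply, audio_path = process_inputs("sample.wav", "xray.png")
# print("Transcript:", text)
# print("Model reply:", reply)
# print("Spoken reply saved to:", audio_path)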