import base64
import datetime
import io
import os
import re
import time
import warnings

import gradio as gr
import nltk
import numpy as np
import requests
import torch
import whisper
from gtts import gTTS
from nltk import sent_tokenize
from PIL import Image
from transformers import BitsAndBytesConfig, pipeline

# Download the sentence tokenizer data used by nltk
nltk.download('punkt')
# Load the LLaVA image-to-text model in 4-bit to reduce GPU memory usage
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline("image-to-text",
                model=model_id,
                model_kwargs={"quantization_config": quantization_config})
# Select the device and load the Whisper speech-to-text model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")
model = whisper.load_model("medium", device=DEVICE)
# Create a time-stamped file to log events
tstamp = datetime.datetime.now()
tstamp = str(tstamp).replace(' ', '_')
logfile = f'{tstamp}_log.txt'

def writehistory(text):
    """Append a line of text to the log file."""
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')
def img2txt(input_text, input_image):
    # Load the image from the provided file path
    image = Image.open(input_image)

    writehistory(f"Input text: {input_text} - Type: {type(input_text)} - Dir: {dir(input_text)}")

    # Use a default prompt when the user did not provide any speech input
    # (transcribe() returns a tuple in that case).
    if isinstance(input_text, tuple):
        prompt_instructions = """
        Describe the medical condition shown in the image using as much detail as possible and provide a treatment plan for the medical condition
        """
    # Otherwise, prepend an instruction prompt to the user's transcribed text
    else:
        prompt_instructions = """
        Act as an expert in medical imagery descriptive analysis. Utilize the
        information depicted in the provided image to generate a comprehensive
        description of the observed medical condition. Include detailed observations
        regarding any anomalies, abnormalities, or notable features present in the image.
        Your response should be thorough and precise, and provide a treatment plan for
        the medical condition.
        """ + input_text

    writehistory(f"prompt_instructions: {prompt_instructions}")
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # Extract the model's reply, i.e. the text after "ASSISTANT:"
    if outputs is not None and len(outputs[0]["generated_text"]) > 0:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
        if match:
            reply = match.group(1)
        else:
            reply = "No response found."
    else:
        reply = "No response generated."

    return reply
def transcribe(audio):
    # If no audio was recorded, return an empty tuple; img2txt() detects this
    # and falls back to its default prompt.
    if audio is None or audio == '':
        return ('', '', None)

    # Load the recording, fit it to 30 seconds, and compute the log-Mel spectrogram
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language, then decode the speech to text
    _, probs = model.detect_language(mel)
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)

    return result.text
# Convert text to speech and save it as an mp3 file
def text_to_speech(text, file_path):
    language = 'en'
    audioobj = gTTS(text=text,
                    lang=language,
                    slow=False)
    audioobj.save(file_path)
    return file_path
# Handle a pair of audio and image inputs from the Gradio interface
def process_inputs(audio_path, image_path):
    # Transcribe the recorded question
    speech_to_text_output = transcribe(audio_path)

    # Describe the uploaded image, if one was provided
    if image_path:
        chatgpt_output = img2txt(speech_to_text_output, image_path)
    else:
        chatgpt_output = "No image provided."

    # Convert the model's reply to speech for audio playback
    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")

    return speech_to_text_output, chatgpt_output, processed_audio_path
# Create the Gradio interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(label="Audio Output")
    ],
    title="(Beta) Medical Research Model with Voice-to-Text Feature",
    description="Upload an image and interact via voice input and audio. (Must give microphone permission.)"
)
# Launch the interface
iface.launch(inline=False, share=True)
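
# A minimal sketch of how the pipeline could be exercised without the UI,
# assuming hypothetical sample files "question.wav" and "xray.png" exist
# alongside this script (they are not shipped with the app):
# speech, reply, audio_path = process_inputs("question.wav", "xray.png")
# print(speech)
# print(reply)
# print(audio_path)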