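"""Voice-driven medical image Q&A demo.

Records a spoken question, transcribes it with Whisper, asks a 4-bit
quantized LLaVA 1.5 model to describe the uploaded medical image (and
suggest a treatment plan), then reads the answer back with gTTS inside
a Gradio interface.
"""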

import datetime
import re

import gradio as gr
import torch
import whisper
from gtts import gTTS
from PIL import Image
from transformers import BitsAndBytesConfig, pipeline

# Load LLaVA 1.5 (7B) as an image-to-text pipeline, quantized to 4-bit so it
# fits on a single consumer GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs={"quantization_config": quantization_config},
)
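
# Optional sanity check, kept commented out: run the pipeline once on a local
# test image before wiring up the UI. "sample.jpg" is a placeholder path, not
# part of the app.
# test_prompt = "USER: <image>\nWhat is shown in this image?\nASSISTANT:"
# test_out = pipe(Image.open("sample.jpg"), prompt=test_prompt,
#                 generate_kwargs={"max_new_tokens": 50})
# print(test_out[0]["generated_text"])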

# Pick the device once and reuse it for Whisper.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")

model = whisper.load_model("medium", device=DEVICE)

# Each run writes its prompts and inputs to a timestamped log file.
tstamp = str(datetime.datetime.now()).replace(' ', '_')
logfile = f'{tstamp}_log.txt'

def writehistory(text):
    # Append one entry per line; the with-block handles closing the file.
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')

def img2txt(input_text, input_image):
    image = Image.open(input_image)

    writehistory(f"Input text: {input_text} - Type: {type(input_text)}")

    # With no transcription, fall back to a generic prompt; otherwise append
    # the user's transcribed question to the analysis instructions.
    if not input_text:
        prompt_instructions = """
        Describe the medical condition shown in the image in as much detail as
        possible and provide a treatment plan for it.
        """
    else:
        prompt_instructions = """
        Act as an expert in medical imagery descriptive analysis. Use the
        information depicted in the provided image to generate a comprehensive
        description of the observed medical condition. Include detailed
        observations regarding any anomalies, abnormalities, or notable features
        present in the image. Your response should be thorough and precise, and
        provide a treatment plan for the medical condition.
        """ + input_text

    writehistory(f"prompt_instructions: {prompt_instructions}")

    # LLaVA 1.5 expects the USER/ASSISTANT chat format with an <image> token.
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # The pipeline echoes the full prompt, so keep only the text after
    # "ASSISTANT:". re.DOTALL lets the match span multiple lines.
    if outputs and outputs[0]["generated_text"]:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"], re.DOTALL)
        reply = match.group(1) if match else "No response found."
    else:
        reply = "No response generated."

    return reply
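
# Usage sketch (hypothetical file): img2txt("What condition is shown here?",
# "xray.png") returns the model's description as plain text.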

def transcribe(audio):
    # Gradio passes None (or an empty path) when nothing was recorded.
    if audio is None or audio == '':
        return ''

    # Load the recording and pad/trim it to Whisper's 30-second window.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language; decoding below picks it up implicitly.
    _, probs = model.detect_language(mel)

    # fp16 decoding is only supported on GPU.
    options = whisper.DecodingOptions(fp16=(DEVICE == "cuda"))
    result = whisper.decode(model, mel, options)

    return result.text
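
# Usage sketch (assumes a local recording, e.g. "question.wav", exists):
# print(transcribe("question.wav"))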

def text_to_speech(text, file_path):
    # gTTS renders the reply to an MP3 file that Gradio can play back.
    audioobj = gTTS(text=text, lang='en', slow=False)
    audioobj.save(file_path)
    return file_path
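
# Usage sketch: text_to_speech("Test reply.", "test.mp3") writes test.mp3 and
# returns the path so the Gradio audio component can play it.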

def process_inputs(audio_path, image_path):
    # Step 1: transcribe the spoken question.
    speech_to_text_output = transcribe(audio_path)

    # Step 2: describe the image, guided by the transcribed question.
    if image_path:
        llava_output = img2txt(speech_to_text_output, image_path)
    else:
        llava_output = "No image provided."

    # Step 3: read the model's reply back as audio.
    processed_audio_path = text_to_speech(llava_output, "Temp3.mp3")

    return speech_to_text_output, llava_output, processed_audio_path

iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="LLaVA Output"),
        # Plays whatever file path process_inputs returns as its third value.
        gr.Audio(label="Audio Response"),
    ],
    title="(Beta) Medical Research Model with Voice-to-Text Feature",
    description="Upload an image and interact via voice input and audio. (You must grant microphone permission.)",
)

iface.launch(inline=False, share=True)