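"""Voice-driven medical image Q&A demo.

Records a spoken question, transcribes it with Whisper, asks a 4-bit
quantized LLaVA 1.5 model to describe the uploaded medical image (and
suggest a treatment plan), then reads the answer back with gTTS inside
a Gradio interface.
"""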

import datetime
import re

import gradio as gr
import torch
import whisper
from gtts import gTTS
from PIL import Image
from transformers import BitsAndBytesConfig, pipeline

# Load LLaVA 1.5 (7B) as an image-to-text pipeline, quantized to 4-bit so it
# fits on a single consumer GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs={"quantization_config": quantization_config},
)
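
# Optional sanity check, kept commented out: run the pipeline once on a local
# test image before wiring up the UI. "sample.jpg" is a placeholder path, not
# part of the app.
# test_prompt = "USER: <image>\nWhat is shown in this image?\nASSISTANT:"
# test_out = pipe(Image.open("sample.jpg"), prompt=test_prompt,
#                 generate_kwargs={"max_new_tokens": 50})
# print(test_out[0]["generated_text"])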

# Pick the device once and reuse it for Whisper.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")

model = whisper.load_model("medium", device=DEVICE)

# Each run writes its prompts and inputs to a timestamped log file.
tstamp = str(datetime.datetime.now()).replace(' ', '_')
logfile = f'{tstamp}_log.txt'

def writehistory(text):
    # Append one entry per line; the with-block handles closing the file.
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')

def img2txt(input_text, input_image):
    image = Image.open(input_image)

    writehistory(f"Input text: {input_text} - Type: {type(input_text)}")

    # With no transcription, fall back to a generic prompt; otherwise append
    # the user's transcribed question to the analysis instructions.
    if not input_text:
        prompt_instructions = """
        Describe the medical condition shown in the image in as much detail as
        possible and provide a treatment plan for it.
        """
    else:
        prompt_instructions = """
        Act as an expert in medical imagery descriptive analysis. Use the
        information depicted in the provided image to generate a comprehensive
        description of the observed medical condition. Include detailed
        observations regarding any anomalies, abnormalities, or notable features
        present in the image. Your response should be thorough and precise, and
        provide a treatment plan for the medical condition.
        """ + input_text

    writehistory(f"prompt_instructions: {prompt_instructions}")

    # LLaVA 1.5 expects the USER/ASSISTANT chat format with an <image> token.
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # The pipeline echoes the full prompt, so keep only the text after
    # "ASSISTANT:". re.DOTALL lets the match span multiple lines.
    if outputs and outputs[0]["generated_text"]:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"], re.DOTALL)
        reply = match.group(1) if match else "No response found."
    else:
        reply = "No response generated."

    return reply
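
# Usage sketch (hypothetical file): img2txt("What condition is shown here?",
# "xray.png") returns the model's description as plain text.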

def transcribe(audio):
    # Gradio passes None (or an empty path) when nothing was recorded.
    if audio is None or audio == '':
        return ''

    # Load the recording and pad/trim it to Whisper's 30-second window.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language; decoding below picks it up implicitly.
    _, probs = model.detect_language(mel)

    # fp16 decoding is only supported on GPU.
    options = whisper.DecodingOptions(fp16=(DEVICE == "cuda"))
    result = whisper.decode(model, mel, options)

    return result.text
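
# Usage sketch (assumes a local recording, e.g. "question.wav", exists):
# print(transcribe("question.wav"))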

def text_to_speech(text, file_path):
    # gTTS renders the reply to an MP3 file that Gradio can play back.
    audioobj = gTTS(text=text, lang='en', slow=False)
    audioobj.save(file_path)
    return file_path
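
# Usage sketch: text_to_speech("Test reply.", "test.mp3") writes test.mp3 and
# returns the path so the Gradio audio component can play it.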

def process_inputs(audio_path, image_path):
    # Step 1: transcribe the spoken question.
    speech_to_text_output = transcribe(audio_path)

    # Step 2: describe the image, guided by the transcribed question.
    if image_path:
        llava_output = img2txt(speech_to_text_output, image_path)
    else:
        llava_output = "No image provided."

    # Step 3: read the model's reply back as audio.
    processed_audio_path = text_to_speech(llava_output, "Temp3.mp3")

    return speech_to_text_output, llava_output, processed_audio_path

iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="LLaVA Output"),
        # Plays whatever file path process_inputs returns as its third value.
        gr.Audio(label="Audio Response"),
    ],
    title="(Beta) Medical Research Model with Voice-to-Text Feature",
    description="Upload an image and interact via voice input and audio. (You must grant microphone permission.)",
)

iface.launch(inline=False, share=True)