import requests
import torch
import gradio as gr
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, BarkModel, AutoProcessor

def image_to_text(img_url):
    # Load the BLIP captioning model and its processor.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
    # For a remote image, fetch it over HTTP instead of opening a local path:
    # raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    raw_image = Image.open(img_url).convert('RGB')
    # Conditional captioning: the model completes this prompt.
    text = "This is a photo of "
    inputs = processor(raw_image, text, return_tensors="pt")
    out = model.generate(**inputs)
    output_text = processor.decode(out[0], skip_special_tokens=True)
    return output_text
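
# Usage sketch (the file name below is just an illustrative placeholder).
# BLIP also supports unconditional captioning: omit the text prompt and call
# processor(raw_image, return_tensors="pt") instead.
# caption = image_to_text("photo.jpg")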
def text_to_speech(text_prompt):
    # Load the small Bark model and move it to the GPU if one is available.
    model = BarkModel.from_pretrained("suno/bark-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    processor = AutoProcessor.from_pretrained("suno/bark")
    # Prepare the inputs and generate speech.
    inputs = processor(text_prompt)
    speech_output = model.generate(**inputs.to(device))
    sampling_rate = model.generation_config.sample_rate
    # Gradio's "audio" output expects a (sample_rate, waveform) tuple,
    # not an IPython Audio widget.
    return sampling_rate, speech_output[0].cpu().numpy()
    # To save the audio to a file instead:
    # import scipy
    # scipy.io.wavfile.write("bark_out.wav", rate=sampling_rate, data=speech_output[0].cpu().numpy())
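
# Bark can also condition on a speaker preset (a sketch; "v2/en_speaker_6" is
# one of the presets shipped with the Bark checkpoints):
# inputs = processor(text_prompt, voice_preset="v2/en_speaker_6")
# speech_output = model.generate(**inputs.to(device))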
def image_to_speech(image):
    # Caption the uploaded image, then speak the caption aloud.
    out_text = image_to_text(image)
    return text_to_speech(out_text)
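
# Note: both helpers reload their models on every request. For a long-running
# app it would be cheaper to load each model and processor once at module
# import time and reuse them inside the functions (a sketch, not part of the
# original code):
# blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
# blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")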
# type="filepath" hands the uploaded image to the handler as a file path,
# which Image.open() can read directly (the default numpy array would break it).
demo = gr.Interface(fn=image_to_speech, inputs=gr.Image(type="filepath"), outputs="audio")

if __name__ == "__main__":
    demo.launch(show_api=False)