Spaces:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, BarkModel, AutoProcessor
import torch

def image_to_text(img_url):
    # Load the BLIP captioning model and its processor
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
    # img_url = '/content/photo.png'
    # raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    raw_image = Image.open(img_url).convert('RGB')
    # Conditional captioning: the model completes this prompt
    text = "This is a photo of "
    inputs = processor(raw_image, text, return_tensors="pt")
    out = model.generate(**inputs)
    output_text = processor.decode(out[0], skip_special_tokens=True)
    return output_text
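
# Quick sanity check (illustrative only; "photo.png" is a placeholder path,
# not a file shipped with the Space):
# caption = image_to_text("photo.png")
# print(caption)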

def text_to_speech(text_prompt):
    # Load the Bark text-to-speech model and move it to GPU if available
    model = BarkModel.from_pretrained("suno/bark-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    processor = AutoProcessor.from_pretrained("suno/bark")
    # prepare the inputs
    inputs = processor(text_prompt)
    # generate speech
    speech_output = model.generate(**inputs.to(device))
    sampling_rate = model.generation_config.sample_rate
    # Return a (sample_rate, waveform) tuple, the format Gradio's audio output expects
    return sampling_rate, speech_output[0].cpu().numpy()
    # To write the audio to disk instead:
    # import scipy
    # scipy.io.wavfile.write("bark_out.wav", rate=sampling_rate, data=speech_output[0].cpu().numpy())
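
# Illustrative usage (the prompt string below is an example, not taken from the app):
# rate, waveform = text_to_speech("This is a photo of a dog playing in a park")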

import gradio as gr

def image_to_speech(image):
    # Caption the uploaded image, then read the caption aloud with Bark
    out_text = image_to_text(image)
    return text_to_speech(out_text)

# type="filepath" hands image_to_speech a temporary file path, which Image.open can read
demo = gr.Interface(fn=image_to_speech, inputs=gr.Image(type="filepath"), outputs="audio")

if __name__ == "__main__":
    demo.launch(show_api=False)
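
# Running `python app.py` locally starts the Gradio server; on a Hugging Face
# Space the same file serves as app.py, with its dependencies (torch,
# transformers, gradio, Pillow) declared in requirements.txt.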