import requests
import torch
import gradio as gr
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, BarkModel, AutoProcessor


def image_to_text(img_url):
    # Note: the BLIP weights are reloaded on every call; load them once at module
    # level if latency matters.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

    # For a remote image instead of a local path:
    # img_url = '/content/photo.png'
    # raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    raw_image = Image.open(img_url).convert('RGB')

    # Conditional captioning: the model completes this prompt.
    text = "This is a photo of "
    inputs = processor(raw_image, text, return_tensors="pt")

    out = model.generate(**inputs)
    output_text = processor.decode(out[0], skip_special_tokens=True)
    return output_text


def text_to_speech(text_prompt):
    model = BarkModel.from_pretrained("suno/bark-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    # The "suno/bark" processor (tokenizer + voice presets) also works with bark-small.
    processor = AutoProcessor.from_pretrained("suno/bark")

    # prepare the inputs
    inputs = processor(text_prompt)

    # generate speech
    speech_output = model.generate(**inputs.to(device))

    sampling_rate = model.generation_config.sample_rate
    audio_array = speech_output[0].cpu().numpy()
    # Return (sample_rate, waveform) so Gradio's audio output can play it directly.
    # In a notebook you could instead wrap it with IPython.display.Audio, or save it:
    # import scipy
    # scipy.io.wavfile.write("bark_out.wav", rate=sampling_rate, data=audio_array)
    return sampling_rate, audio_array


def image_to_speech(image):
    out_text = image_to_text(image)
    return text_to_speech(out_text)


# type="filepath" passes the uploaded image to image_to_text as a path, which Image.open expects.
demo = gr.Interface(fn=image_to_speech, inputs=gr.Image(type="filepath"), outputs="audio")

if __name__ == "__main__":
    demo.launch(show_api=False)
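
# A minimal smoke test without the Gradio UI (a sketch; it assumes a local image file,
# here called "photo.png", and that scipy is installed):
#
#     caption = image_to_text("photo.png")
#     rate, audio = text_to_speech(caption)
#     import scipy.io.wavfile
#     scipy.io.wavfile.write("bark_out.wav", rate=rate, data=audio)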