# Hugging Face Space: image captioning + text-to-speech demo
# Use a pipeline as a high-level helper
import torch
from transformers import pipeline
from scipy.io import wavfile
from PIL import Image
import gradio as gr

# Run on GPU when available; both pipelines accept a device string.
device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP captioning model: produces a text description of an input image.
image_pipe = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)

# VITS model: synthesizes speech audio from text.
# NOTE(review): "narator" is a misspelling of "narrator"; the name is kept
# because other code in this file references it.
narator = pipeline(
    "text-to-speech",
    model="kakao-enterprise/vits-ljs",
    device=device,
)
def generate_audio(text):
    """Synthesize *text* to speech and save it to "audio.wav".

    Args:
        text: The string to narrate.

    Returns:
        The path of the written WAV file ("audio.wav").
    """
    # Generate the audio from the text. The TTS pipeline returns a dict
    # with an 'audio' array and its 'sampling_rate'.
    audio_text = narator(text)
    # Save the audio to a WAV file. The [0] index assumes 'audio' carries
    # a leading batch dimension of size 1 — TODO confirm against the
    # pipeline's output shape.
    wavfile.write(
        filename="audio.wav",
        rate=audio_text["sampling_rate"],
        data=audio_text["audio"][0],
    )
    return "audio.wav"
def caption_my_image(image_path):
    """Caption an image and narrate that caption as speech.

    Args:
        image_path: The input image; when called from the Gradio UI this
            is a PIL Image (gr.Image is configured with type="pil").

    Returns:
        Path to the generated WAV file containing the spoken caption.
    """
    # The captioning pipeline returns a list of result dicts; use the
    # first candidate's generated text.
    image = image_pipe(image_path)
    caption_text = image[0]["generated_text"]
    return generate_audio(caption_text)
# Gradio UI: upload an image, receive spoken audio of its caption.
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Image", type="pil")],
    outputs=[gr.Audio(label="Image Caption")],
    title="@SmartChoiceLearningHub HF Project 1 :Image to Text to Speech",
    description="This app generates a caption for an image and converts the caption to speech.",
)
demo.launch()